
Commit 7ed2cb9

HBASE-27567 Introduce ChaosMonkey Action to print HDFS Cluster status
Signed-off-by: Reid Chan <[email protected]>
Signed-off-by: Duo Zhang <[email protected]>
1 parent 2a7c69d commit 7ed2cb9

5 files changed: 202 additions, 39 deletions

hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpHdfsClusterStatusAction.java (new file)

Lines changed: 78 additions & 0 deletions
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.chaos.actions;

import java.net.InetSocketAddress;
import java.net.URI;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.HAUtilClient;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DumpHdfsClusterStatusAction extends Action {
  private static final Logger LOG = LoggerFactory.getLogger(DumpHdfsClusterStatusAction.class);
  private static final String PREFIX = "\n ";

  @Override
  protected Logger getLogger() {
    return LOG;
  }

  @Override
  public void perform() throws Exception {
    StringBuilder sb = new StringBuilder();
    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
      final Configuration dfsConf = dfs.getConf();
      final URI dfsUri = dfs.getUri();
      final boolean isHaAndLogicalUri = HAUtilClient.isLogicalUri(dfsConf, dfsUri);
      sb.append("Cluster status").append('\n');
      if (isHaAndLogicalUri) {
        final String nsId = dfsUri.getHost();
        final List<ClientProtocol> namenodes =
          HAUtil.getProxiesForAllNameNodesInNameservice(dfsConf, nsId);
        final boolean atLeastOneActive = HAUtil.isAtLeastOneActive(namenodes);
        final InetSocketAddress activeAddress = HAUtil.getAddressOfActive(dfs);
        sb.append("Active NameNode=").append(activeAddress).append(", isAtLeastOneActive=")
          .append(atLeastOneActive).append('\n');
      }
      DatanodeInfo[] dns = dfs.getClient().datanodeReport(HdfsConstants.DatanodeReportType.LIVE);
      sb.append("Number of live DataNodes: ").append(dns.length);
      for (DatanodeInfo dni : dns) {
        sb.append(PREFIX).append("name=").append(dni.getName()).append(", used%=")
          .append(dni.getDfsUsedPercent()).append(", capacity=")
          .append(FileUtils.byteCountToDisplaySize(dni.getCapacity()));
      }
      sb.append('\n');
      dns = dfs.getClient().datanodeReport(HdfsConstants.DatanodeReportType.DEAD);
      sb.append("Number of dead DataNodes: ").append(dns.length);
      for (DatanodeInfo dni : dns) {
        sb.append(PREFIX).append(dni.getName()).append("/").append(dni.getNetworkLocation());
      }
    }
    // TODO: add more on NN, JNs, and ZK.
    // TODO: Print how long process has been up.
    getLogger().info(sb.toString());
  }
}
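The HA branch above hinges on HAUtilClient.isLogicalUri and on treating the URI host as the nameservice id. A rough illustration of that distinction, with an assumed nameservice id and host name that are not taken from the patch:

import java.net.URI;

public class LogicalUriExample {
  public static void main(String[] args) {
    // Illustrative values only; "mycluster" and the NameNode host are assumptions.
    URI logical = URI.create("hdfs://mycluster");           // HA: the URI "host" is the nameservice id
    URI direct = URI.create("hdfs://nn1.example.com:8020"); // non-HA: the host is a single NameNode
    System.out.println(logical.getHost() + " vs " + direct.getHost());
    // HAUtilClient.isLogicalUri(conf, logical) is true only when "mycluster" is declared in
    // dfs.nameservices, which is why the action also needs the Configuration and passes
    // dfsUri.getHost() to HAUtil.getProxiesForAllNameNodesInNameservice as the nameservice id.
  }
}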
hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/HdfsActionUtils.java (new file)

Lines changed: 73 additions & 0 deletions
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.security.PrivilegedExceptionAction;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.security.UserGroupInformation;

/**
 * Configuration common across the HDFS Actions.
 */
public final class HdfsActionUtils {

  private HdfsActionUtils() {
  }

  /**
   * Specify a user as whom HDFS actions should be run. The chaos process must have permissions
   * sufficient to assume the role of the specified user.
   * @see <a href=
   *      "https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/Superusers.html">Proxy
   *      user - Superusers Acting On Behalf Of Other Users</a>
   */
  public static final String HDFS_USER_CONF_KEY = "org.apache.hadoop.hbase.chaos.actions.hdfs_user";

  private static DistributedFileSystem createUnproxiedDfs(final Configuration conf)
    throws IOException {
    final Path rootDir = CommonFSUtils.getRootDir(conf);
    final FileSystem fs = rootDir.getFileSystem(conf);
    return (DistributedFileSystem) fs;
  }

  /**
   * Create an instance of {@link DistributedFileSystem} that honors {@value HDFS_USER_CONF_KEY}.
   */
  static DistributedFileSystem createDfs(final Configuration conf) throws IOException {
    final String proxyUser = conf.get(HDFS_USER_CONF_KEY);
    if (proxyUser == null) {
      return createUnproxiedDfs(conf);
    }
    final UserGroupInformation proxyUgi =
      UserGroupInformation.createProxyUser(proxyUser, UserGroupInformation.getLoginUser());
    try {
      return proxyUgi
        .doAs((PrivilegedExceptionAction<DistributedFileSystem>) () -> createUnproxiedDfs(conf));
    } catch (InterruptedException e) {
      final InterruptedIOException iioe = new InterruptedIOException(e.getMessage());
      iioe.setStackTrace(e.getStackTrace());
      throw iioe;
    }
  }
}
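A minimal sketch of opting in to the proxy-user path. The user name "hdfs" and the standalone setup are assumptions, not part of the patch; the key string matches HdfsActionUtils.HDFS_USER_CONF_KEY above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class HdfsUserConfigExample {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // "hdfs" is an assumed proxy user; it must be allowed to be impersonated by the chaos process.
    conf.set("org.apache.hadoop.hbase.chaos.actions.hdfs_user", "hdfs");
    // With this key set, any HDFS chaos action that goes through HdfsActionUtils.createDfs(conf)
    // opens the DistributedFileSystem inside
    // UserGroupInformation.createProxyUser("hdfs", loginUser).doAs(...), so the NameNode sees the
    // operations as user "hdfs". Without the key, the login user is used directly.
  }
}

Since createDfs is package-private, this configuration key is the only public surface; the actions in the chaos package pick it up automatically.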

hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java

Lines changed: 39 additions & 25 deletions
@@ -17,15 +17,17 @@
  */
 package org.apache.hadoop.hbase.chaos.actions;
 
+import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ServerName;
-import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
 import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
 import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.HAUtil;
 import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
 import org.slf4j.Logger;
@@ -57,39 +59,51 @@ protected Logger getLogger() {
   @Override
   public void perform() throws Exception {
     getLogger().info("Performing action: Restart active namenode");
-    Configuration conf = CommonFSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
-    String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
-    if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
-      throw new Exception("HA for namenode is not enabled");
-    }
-    ZKWatcher zkw = null;
-    RecoverableZooKeeper rzk = null;
+
+    final String hadoopHAZkNode;
     String activeNamenode = null;
-    String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
-    try {
-      zkw = new ZKWatcher(conf, "get-active-namenode", null);
-      rzk = zkw.getRecoverableZooKeeper();
-      String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
-      List<String> subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
-      for (String eachEntry : subChildern) {
-        if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
+    int activeNamenodePort = -1;
+    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
+      final Configuration conf = dfs.getConf();
+      hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
+      final String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
+
+      if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
+        getLogger().info("HA for HDFS is not enabled; skipping");
+        return;
+      }
+      try (final ZKWatcher zkw = new ZKWatcher(conf, "get-active-namenode", null)) {
+        final RecoverableZooKeeper rzk = zkw.getRecoverableZooKeeper();
+        // If hadoopHAZkNode == '/', pass '' instead because then joinZNode will return '//' as a
+        // prefix
+        // which zk doesn't like as a prefix on the path.
+        final String hadoopHAZkNodePath = ZNodePaths.joinZNode(
+          (hadoopHAZkNode != null && hadoopHAZkNode.equals("/")) ? "" : hadoopHAZkNode,
+          nameServiceID);
+        final List<String> subChildren =
+          Optional.ofNullable(ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath))
+            .orElse(Collections.emptyList());
+        for (final String eachEntry : subChildren) {
+          if (!eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
+            continue;
+          }
           byte[] data =
             rzk.getData(ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false, null);
           ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data);
           activeNamenode = proto.getHostname();
+          activeNamenodePort = proto.getPort();
         }
       }
-    } finally {
-      if (zkw != null) {
-        zkw.close();
-      }
     }
+
     if (activeNamenode == null) {
-      throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
+      getLogger().info("No active Name node found in zookeeper under '{}'", hadoopHAZkNode);
+      return;
     }
-    getLogger().info("Found active namenode host:" + activeNamenode);
-    ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
-    getLogger().info("Restarting Active NameNode :" + activeNamenode);
-    restartNameNode(activeNNHost, sleepTime);
+
+    getLogger().info("Found Active NameNode host: {}", activeNamenode);
+    final ServerName activeNNHost = ServerName.valueOf(activeNamenode, activeNamenodePort, -1L);
+    getLogger().info("Restarting Active NameNode: {}", activeNamenode);
+    restartNameNode(activeNNHost, this.sleepTime);
   }
 }
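The in-line comment about hadoopHAZkNode == '/' is easier to see with concrete values. A small sketch, assuming the default HA parent znode "/hadoop-ha" and an assumed nameservice id "mycluster":

import org.apache.hadoop.hbase.zookeeper.ZNodePaths;

public class JoinZNodeExample {
  public static void main(String[] args) {
    String nsId = "mycluster"; // assumed nameservice id
    System.out.println(ZNodePaths.joinZNode("/hadoop-ha", nsId)); // "/hadoop-ha/mycluster"
    System.out.println(ZNodePaths.joinZNode("/", nsId));          // "//mycluster": ZooKeeper rejects the double slash
    System.out.println(ZNodePaths.joinZNode("", nsId));           // "/mycluster": hence the "/" -> "" substitution above
  }
}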

hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java

Lines changed: 7 additions & 13 deletions
@@ -18,14 +18,11 @@
 package org.apache.hadoop.hbase.chaos.actions;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Arrays;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
-import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -48,18 +45,15 @@ protected Logger getLogger() {
   @Override
   public void perform() throws Exception {
     getLogger().info("Performing action: Restart random data node");
-    ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
+    final ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
     restartDataNode(server, sleepTime);
   }
 
-  public ServerName[] getDataNodes() throws IOException {
-    DistributedFileSystem fs =
-      (DistributedFileSystem) CommonFSUtils.getRootDir(getConf()).getFileSystem(getConf());
-    DFSClient dfsClient = fs.getClient();
-    List<ServerName> hosts = new ArrayList<>();
-    for (DatanodeInfo dataNode : dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE)) {
-      hosts.add(ServerName.valueOf(dataNode.getHostName(), -1, -1));
+  private ServerName[] getDataNodes() throws IOException {
+    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
+      final DFSClient dfsClient = dfs.getClient();
+      return Arrays.stream(dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE))
+        .map(dn -> ServerName.valueOf(dn.getHostName(), -1, -1)).toArray(ServerName[]::new);
     }
-    return hosts.toArray(new ServerName[0]);
   }
 }

hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java

Lines changed: 5 additions & 1 deletion
@@ -19,9 +19,11 @@
 
 import org.apache.hadoop.hbase.chaos.actions.Action;
 import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpHdfsClusterStatusAction;
 import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
 import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartActiveNameNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
@@ -55,6 +57,7 @@ public ChaosMonkey build() {
       // only allow 2 servers to be dead.
       new RollingBatchRestartRsAction(5000, 1.0f, 2, true),
       new ForceBalancerAction(),
+      new RestartActiveNameNodeAction(60000),
       new RestartRandomDataNodeAction(60000),
       new RestartRandomZKNodeAction(60000),
       new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
@@ -64,7 +67,8 @@ public ChaosMonkey build() {
     // @formatter:on
 
     // Action to log more info for debugging
-    Action[] actions2 = new Action[] { new DumpClusterStatusAction() };
+    Action[] actions2 =
+      new Action[] { new DumpClusterStatusAction(), new DumpHdfsClusterStatusAction() };
 
     return new PolicyBasedChaosMonkey(properties, util,
       new CompositeSequentialPolicy(new DoActionsOncePolicy(60 * 1000, actions1),
