Skip to content

Commit 0b115b6

Browse files
hanishakonerubharatviswa504
authored and committed
HDDS-1371. OMSnapshotProvider to download DB checkpoint from leader OM. (#703)
1 parent 3ea4f41 commit 0b115b6

File tree

17 files changed

+723
-55
lines changed

17 files changed

+723
-55
lines changed

hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ public final class OzoneConsts {
7676
public static final String OZONE_USER = "user";
7777
public static final String OZONE_REQUEST = "request";
7878

79+
// OM Http server endpoints
80+
public static final String OZONE_OM_SERVICE_LIST_HTTP_ENDPOINT =
81+
"/serviceList";
82+
public static final String OZONE_OM_DB_CHECKPOINT_HTTP_ENDPOINT =
83+
"/dbCheckpoint";
84+
7985
// Ozone File System scheme
8086
public static final String OZONE_URI_SCHEME = "o3fs";
8187

@@ -286,4 +292,9 @@ private OzoneConsts() {
286292

287293
// OM Ratis snapshot file to store the last applied index
288294
public static final String OM_RATIS_SNAPSHOT_INDEX = "ratisSnapshotIndex";
295+
296+
// OM Http request parameter to be used while downloading DB checkpoint
297+
// from OM leader to follower
298+
public static final String OM_RATIS_SNAPSHOT_BEFORE_DB_CHECKPOINT =
299+
"snapshotBeforeCheckpoint";
289300
}

hadoop-hdds/common/src/main/java/org/apache/hadoop/utils/db/DBCheckpoint.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,20 @@ public interface DBCheckpoint {
5555
*/
5656
void cleanupCheckpoint() throws IOException;
5757

58+
/**
59+
* Set the OM Ratis snapshot index corresponding to the OM DB checkpoint.
60+
* The snapshot index is the latest snapshot index saved by ratis
61+
* snapshots. It is not guaranteed to be the last ratis index applied to
62+
* the OM DB state.
63+
* @param omRatisSnapshotIndex the saved ratis snapshot index
64+
*/
65+
void setRatisSnapshotIndex(long omRatisSnapshotIndex);
66+
67+
/**
68+
* Get the OM Ratis snapshot index corresponding to the OM DB checkpoint.
69+
* The ratis snapshot index indicates up to which index is definitely
70+
* included in the DB checkpoint. It is not guaranteed to be the last ratis
71+
* log index applied to the DB checkpoint.
72+
*/
73+
long getRatisSnapshotIndex();
5874
}

hadoop-hdds/common/src/main/java/org/apache/hadoop/utils/db/RocksDBCheckpoint.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ public class RocksDBCheckpoint implements DBCheckpoint {
3838
private long checkpointTimestamp = System.currentTimeMillis();
3939
private long latestSequenceNumber = -1;
4040
private long checkpointCreationTimeTaken = 0L;
41+
private long ratisSnapshotIndex = 0L;
4142

4243
public RocksDBCheckpoint(Path checkpointLocation) {
4344
this.checkpointLocation = checkpointLocation;
@@ -78,4 +79,14 @@ public void cleanupCheckpoint() throws IOException {
7879
LOG.debug("Cleaning up checkpoint at " + checkpointLocation.toString());
7980
FileUtils.deleteDirectory(checkpointLocation.toFile());
8081
}
82+
83+
@Override
84+
public void setRatisSnapshotIndex(long omRatisSnapshotIndex) {
85+
this.ratisSnapshotIndex = omRatisSnapshotIndex;
86+
}
87+
88+
@Override
89+
public long getRatisSnapshotIndex() {
90+
return ratisSnapshotIndex;
91+
}
8192
}

hadoop-hdds/common/src/main/resources/ozone-default.xml

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1585,6 +1585,8 @@
15851585
logs. If this is not set then default metadata dirs is used. A warning
15861586
will be logged if this is not set. Ideally, this should be mapped to a
15871587
fast disk like an SSD.
1588+
If undefined, OM ratis storage dir will fall back to ozone.metadata.dirs.
1589+
This fallback approach is not recommended for production environments.
15881590
</description>
15891591
</property>
15901592

@@ -1703,6 +1705,45 @@
17031705
.</description>
17041706
</property>
17051707

1708+
<property>
1709+
<name>ozone.om.ratis.snapshot.dir</name>
1710+
<value/>
1711+
<tag>OZONE, OM, STORAGE, MANAGEMENT, RATIS</tag>
1712+
<description>This directory is used for storing OM's snapshot
1713+
related files like the ratisSnapshotIndex and DB checkpoint from leader
1714+
OM.
1715+
If undefined, OM snapshot dir will fall back to the "snapshot" subdirectory of the OM ratis storage dir (ozone.om.ratis.storage.dir).
1716+
This fallback approach is not recommended for production environments.
1717+
</description>
1718+
</property>
1719+
<property>
1720+
<name>ozone.om.snapshot.provider.socket.timeout</name>
1721+
<value>5000ms</value>
1722+
<tag>OZONE, OM, HA, MANAGEMENT</tag>
1723+
<description>
1724+
Socket timeout for HTTP call made by OM Snapshot Provider to request
1725+
OM snapshot from OM Leader.
1726+
</description>
1727+
</property>
1728+
<property>
1729+
<name>ozone.om.snapshot.provider.connection.timeout</name>
1730+
<value>5000ms</value>
1731+
<tag>OZONE, OM, HA, MANAGEMENT</tag>
1732+
<description>
1733+
Connection timeout for HTTP call made by OM Snapshot Provider to request
1734+
OM snapshot from OM Leader.
1735+
</description>
1736+
</property>
1737+
<property>
1738+
<name>ozone.om.snapshot.provider.request.timeout</name>
1739+
<value>5000ms</value>
1740+
<tag>OZONE, OM, HA, MANAGEMENT</tag>
1741+
<description>
1742+
Connection request timeout for HTTP call made by OM Snapshot Provider to
1743+
request OM snapshot from OM Leader.
1744+
</description>
1745+
</property>
1746+
17061747
<property>
17071748
<name>ozone.acl.authorizer.class</name>
17081749
<value>org.apache.hadoop.ozone.security.acl.OzoneAccessAuthorizer</value>
@@ -2346,14 +2387,6 @@
23462387
OM snapshot.
23472388
</description>
23482389
</property>
2349-
<property>
2350-
<name>recon.om.socket.timeout</name>
2351-
<value>5s</value>
2352-
<tag>OZONE, RECON, OM</tag>
2353-
<description>
2354-
Socket timeout for HTTP call made by Recon to request OM snapshot.
2355-
</description>
2356-
</property>
23572390
<property>
23582391
<name>recon.om.snapshot.task.initial.delay</name>
23592392
<value>1m</value>

hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/rest/RestClient.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999

100100
import static java.net.HttpURLConnection.HTTP_CREATED;
101101
import static java.net.HttpURLConnection.HTTP_OK;
102+
import static org.apache.hadoop.ozone.OzoneConsts.OZONE_OM_SERVICE_LIST_HTTP_ENDPOINT;
102103

103104
/**
104105
* Ozone Client REST protocol implementation. It uses REST protocol to
@@ -190,7 +191,8 @@ private InetSocketAddress getOzoneRestServerAddress(
190191
" details on configuring Ozone.");
191192
}
192193

193-
HttpGet httpGet = new HttpGet("http://" + httpAddress + "/serviceList");
194+
HttpGet httpGet = new HttpGet("http://" + httpAddress +
195+
OZONE_OM_SERVICE_LIST_HTTP_ENDPOINT);
194196
HttpEntity entity = executeHttpRequest(httpGet);
195197
try {
196198
String serviceListJson = EntityUtils.toString(entity);

hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/OmUtils.java

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,23 @@
2727
import java.net.InetSocketAddress;
2828
import java.nio.charset.StandardCharsets;
2929
import java.nio.file.Path;
30+
import java.nio.file.Paths;
3031
import java.security.MessageDigest;
3132
import java.security.NoSuchAlgorithmException;
3233
import java.util.Collection;
3334
import java.util.Collections;
3435
import java.util.Optional;
3536
import java.util.zip.GZIPOutputStream;
3637

38+
import com.google.common.base.Strings;
3739
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
3840
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
3941
import org.apache.commons.compress.utils.IOUtils;
4042
import org.apache.commons.lang3.RandomStringUtils;
43+
import org.apache.commons.lang3.StringUtils;
4144
import org.apache.hadoop.conf.Configuration;
4245
import org.apache.hadoop.hdds.scm.ScmUtils;
46+
import org.apache.hadoop.hdds.scm.HddsServerUtil;
4347
import org.apache.hadoop.net.NetUtils;
4448
import org.apache.hadoop.ozone.om.OMConfigKeys;
4549
import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos;
@@ -48,7 +52,11 @@
4852
import static org.apache.hadoop.hdds.HddsUtils.getPortNumberFromConfigKeys;
4953
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ADDRESS_KEY;
5054
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_BIND_HOST_DEFAULT;
55+
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_HTTPS_ADDRESS_KEY;
56+
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_HTTPS_BIND_HOST_KEY;
57+
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_HTTPS_BIND_PORT_DEFAULT;
5158
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_HTTP_ADDRESS_KEY;
59+
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_HTTP_BIND_HOST_KEY;
5260
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_HTTP_BIND_PORT_DEFAULT;
5361
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_NODES_KEY;
5462
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_PORT_DEFAULT;
@@ -366,4 +374,101 @@ private static void addFilesToArchive(String source, File file,
366374
}
367375
}
368376

377+
/**
378+
* If an OM conf is only set with a key suffixed with the OM Node ID, return the
379+
* set value.
380+
* @return null if base conf key is set, otherwise the value set for
381+
* key suffixed with Node ID (null if that key is not set either).
382+
*/
383+
public static String getConfSuffixedWithOMNodeId(Configuration conf,
384+
String confKey, String omServiceID, String omNodeId) {
385+
String confValue = conf.getTrimmed(confKey);
386+
if (StringUtils.isNotEmpty(confValue)) {
387+
return null;
388+
}
389+
String suffixedConfKey = OmUtils.addKeySuffixes(
390+
confKey, omServiceID, omNodeId);
391+
confValue = conf.getTrimmed(suffixedConfKey);
392+
if (StringUtils.isNotEmpty(confValue)) {
393+
return confValue;
394+
}
395+
return null;
396+
}
397+
398+
/**
399+
* Returns the http address of peer OM node.
400+
* @param conf Configuration
401+
* @param omNodeId peer OM node ID
402+
* @param omNodeHostAddr peer OM node host address
403+
* @return http address of peer OM node in the format {@code <hostName>:<port>}
404+
*/
405+
public static String getHttpAddressForOMPeerNode(Configuration conf,
406+
String omServiceId, String omNodeId, String omNodeHostAddr) {
407+
final Optional<String> bindHost = getHostNameFromConfigKeys(conf,
408+
addKeySuffixes(OZONE_OM_HTTP_BIND_HOST_KEY, omServiceId, omNodeId));
409+
410+
final Optional<Integer> addressPort = getPortNumberFromConfigKeys(conf,
411+
addKeySuffixes(OZONE_OM_HTTP_ADDRESS_KEY, omServiceId, omNodeId));
412+
413+
final Optional<String> addressHost = getHostNameFromConfigKeys(conf,
414+
addKeySuffixes(OZONE_OM_HTTP_ADDRESS_KEY, omServiceId, omNodeId));
415+
416+
String hostName = bindHost.orElse(addressHost.orElse(omNodeHostAddr));
417+
418+
return hostName + ":" + addressPort.orElse(OZONE_OM_HTTP_BIND_PORT_DEFAULT);
419+
}
420+
421+
/**
422+
* Returns the https address of peer OM node.
423+
* @param conf Configuration
424+
* @param omNodeId peer OM node ID
425+
* @param omNodeHostAddr peer OM node host address
426+
* @return https address of peer OM node in the format {@code <hostName>:<port>}
427+
*/
428+
public static String getHttpsAddressForOMPeerNode(Configuration conf,
429+
String omServiceId, String omNodeId, String omNodeHostAddr) {
430+
final Optional<String> bindHost = getHostNameFromConfigKeys(conf,
431+
addKeySuffixes(OZONE_OM_HTTPS_BIND_HOST_KEY, omServiceId, omNodeId));
432+
433+
final Optional<Integer> addressPort = getPortNumberFromConfigKeys(conf,
434+
addKeySuffixes(OZONE_OM_HTTPS_ADDRESS_KEY, omServiceId, omNodeId));
435+
436+
final Optional<String> addressHost = getHostNameFromConfigKeys(conf,
437+
addKeySuffixes(OZONE_OM_HTTPS_ADDRESS_KEY, omServiceId, omNodeId));
438+
439+
String hostName = bindHost.orElse(addressHost.orElse(omNodeHostAddr));
440+
441+
return hostName + ":" +
442+
addressPort.orElse(OZONE_OM_HTTPS_BIND_PORT_DEFAULT);
443+
}
444+
445+
/**
446+
* Get the local directory where ratis logs will be stored.
447+
*/
448+
public static String getOMRatisDirectory(Configuration conf) {
449+
String storageDir = conf.get(OMConfigKeys.OZONE_OM_RATIS_STORAGE_DIR);
450+
451+
if (Strings.isNullOrEmpty(storageDir)) {
452+
storageDir = HddsServerUtil.getDefaultRatisDirectory(conf);
453+
}
454+
return storageDir;
455+
}
456+
457+
public static String getOMRatisSnapshotDirectory(Configuration conf) {
458+
String snapshotDir = conf.get(OMConfigKeys.OZONE_OM_RATIS_SNAPSHOT_DIR);
459+
460+
if (Strings.isNullOrEmpty(snapshotDir)) {
461+
snapshotDir = Paths.get(getOMRatisDirectory(conf),
462+
"snapshot").toString();
463+
}
464+
return snapshotDir;
465+
}
466+
467+
public static File createOMDir(String dirPath) {
468+
File dirFile = new File(dirPath);
469+
if (!dirFile.exists() && !dirFile.mkdirs()) {
470+
throw new IllegalArgumentException("Unable to create path: " + dirFile);
471+
}
472+
return dirFile;
473+
}
369474
}

hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,27 @@ private OMConfigKeys() {
184184
OZONE_OM_RATIS_SERVER_ROLE_CHECK_INTERVAL_DEFAULT
185185
= TimeDuration.valueOf(15, TimeUnit.SECONDS);
186186

187+
// OM SnapshotProvider configurations
188+
public static final String OZONE_OM_RATIS_SNAPSHOT_DIR =
189+
"ozone.om.ratis.snapshot.dir";
190+
public static final String OZONE_OM_SNAPSHOT_PROVIDER_SOCKET_TIMEOUT_KEY =
191+
"ozone.om.snapshot.provider.socket.timeout";
192+
public static final TimeDuration
193+
OZONE_OM_SNAPSHOT_PROVIDER_SOCKET_TIMEOUT_DEFAULT =
194+
TimeDuration.valueOf(5000, TimeUnit.MILLISECONDS);
195+
196+
public static final String OZONE_OM_SNAPSHOT_PROVIDER_CONNECTION_TIMEOUT_KEY =
197+
"ozone.om.snapshot.provider.connection.timeout";
198+
public static final TimeDuration
199+
OZONE_OM_SNAPSHOT_PROVIDER_CONNECTION_TIMEOUT_DEFAULT =
200+
TimeDuration.valueOf(5000, TimeUnit.MILLISECONDS);
201+
202+
public static final String OZONE_OM_SNAPSHOT_PROVIDER_REQUEST_TIMEOUT_KEY =
203+
"ozone.om.snapshot.provider.request.timeout";
204+
public static final TimeDuration
205+
OZONE_OM_SNAPSHOT_PROVIDER_REQUEST_TIMEOUT_DEFAULT =
206+
TimeDuration.valueOf(5000, TimeUnit.MILLISECONDS);
207+
187208
public static final String OZONE_OM_KERBEROS_KEYTAB_FILE_KEY = "ozone.om."
188209
+ "kerberos.keytab.file";
189210
public static final String OZONE_OM_KERBEROS_PRINCIPAL_KEY = "ozone.om"

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -200,18 +200,17 @@ private Map<String, OzoneManager> createOMService() throws IOException,
200200
// Set nodeId
201201
String nodeId = nodeIdBaseStr + i;
202202
conf.set(OMConfigKeys.OZONE_OM_NODE_ID_KEY, nodeId);
203+
// Clear the OM http(s) address (empty string) so that the cluster picks
204+
// up the address set with service ID and node ID in initHAConfig
205+
conf.set(OMConfigKeys.OZONE_OM_HTTP_ADDRESS_KEY, "");
206+
conf.set(OMConfigKeys.OZONE_OM_HTTPS_ADDRESS_KEY, "");
203207

204208
// Set metadata/DB dir base path
205209
String metaDirPath = path + "/" + nodeId;
206210
conf.set(OZONE_METADATA_DIRS, metaDirPath);
207211
OMStorage omStore = new OMStorage(conf);
208212
initializeOmStorage(omStore);
209213

210-
// Set HTTP address to the rpc port + 2
211-
int httpPort = basePort + (6*i) - 4;
212-
conf.set(OMConfigKeys.OZONE_OM_HTTP_ADDRESS_KEY,
213-
"127.0.0.1:" + httpPort);
214-
215214
OzoneManager om = OzoneManager.createOm(null, conf);
216215
om.setCertClient(certClient);
217216
omMap.put(nodeId, om);
@@ -261,11 +260,16 @@ private void initHAConfig(int basePort) throws IOException {
261260
omNodesKeyValue.append(",").append(omNodeId);
262261
String omAddrKey = OmUtils.addKeySuffixes(
263262
OMConfigKeys.OZONE_OM_ADDRESS_KEY, omServiceId, omNodeId);
263+
String omHttpAddrKey = OmUtils.addKeySuffixes(
264+
OMConfigKeys.OZONE_OM_HTTP_ADDRESS_KEY, omServiceId, omNodeId);
265+
String omHttpsAddrKey = OmUtils.addKeySuffixes(
266+
OMConfigKeys.OZONE_OM_HTTPS_ADDRESS_KEY, omServiceId, omNodeId);
264267
String omRatisPortKey = OmUtils.addKeySuffixes(
265268
OMConfigKeys.OZONE_OM_RATIS_PORT_KEY, omServiceId, omNodeId);
266269

267270
conf.set(omAddrKey, "127.0.0.1:" + port);
268-
// Reserve port+2 for OMs HTTP server
271+
conf.set(omHttpAddrKey, "127.0.0.1:" + (port + 2));
272+
conf.set(omHttpsAddrKey, "127.0.0.1:" + (port + 3));
269273
conf.setInt(omRatisPortKey, port + 4);
270274
}
271275

0 commit comments

Comments
 (0)