Skip to content

Commit e15be23

Browse files
HBASE-28690 Aborting Active HMaster is not rejecting reportRegionStateTransition if procedure is initialised by next Active master (#6180)
Added masterActiveTime as a fencing token for remote procedures. Signed-off-by: Duo Zhang <[email protected]>; Reviewed-by: Aman Poonia <[email protected]>
1 parent bb3cfbb commit e15be23

27 files changed

+182
-64
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2986,11 +2986,13 @@ public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte
29862986

29872987
public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName,
29882988
ServerName destinationServer) {
2989-
return buildCloseRegionRequest(server, regionName, destinationServer, -1);
2989+
// this method is used when we are bypassing active HMaster, so we don't have procId or master
2990+
// active time.
2991+
return buildCloseRegionRequest(server, regionName, destinationServer, -1, -1);
29902992
}
29912993

29922994
public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName,
2993-
ServerName destinationServer, long closeProcId) {
2995+
ServerName destinationServer, long closeProcId, long initiatingMasterActiveTime) {
29942996
CloseRegionRequest.Builder builder = CloseRegionRequest.newBuilder();
29952997
RegionSpecifier region =
29962998
RequestConverter.buildRegionSpecifier(RegionSpecifierType.REGION_NAME, regionName);
@@ -3001,6 +3003,7 @@ public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte
30013003
if (server != null) {
30023004
builder.setServerStartCode(server.getStartcode());
30033005
}
3006+
builder.setInitiatingMasterActiveTime(initiatingMasterActiveTime);
30043007
builder.setCloseProcId(closeProcId);
30053008
return builder.build();
30063009
}

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,14 +222,22 @@ protected final void submitTask(Runnable task, long delay, TimeUnit unit) {
222222
*/
223223
public static abstract class RemoteOperation {
224224
private final RemoteProcedure remoteProcedure;
225+
// active time of the master that sent this request, used for fencing
226+
private final long initiatingMasterActiveTime;
225227

226-
protected RemoteOperation(final RemoteProcedure remoteProcedure) {
228+
protected RemoteOperation(final RemoteProcedure remoteProcedure,
229+
long initiatingMasterActiveTime) {
227230
this.remoteProcedure = remoteProcedure;
231+
this.initiatingMasterActiveTime = initiatingMasterActiveTime;
228232
}
229233

230234
public RemoteProcedure getRemoteProcedure() {
231235
return remoteProcedure;
232236
}
237+
238+
public long getInitiatingMasterActiveTime() {
239+
return initiatingMasterActiveTime;
240+
}
233241
}
234242

235243
/**

hbase-protocol-shaded/src/main/protobuf/Admin.proto

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ message OpenRegionRequest {
8080
repeated RegionOpenInfo open_info = 1;
8181
// the intended server for this RPC.
8282
optional uint64 serverStartCode = 2;
83+
// Master active time as fencing token
84+
optional int64 initiating_master_active_time = 3;
8385
// wall clock time from master
8486
optional uint64 master_system_time = 5;
8587

@@ -122,6 +124,8 @@ message CloseRegionRequest {
122124
// the intended server for this RPC.
123125
optional uint64 serverStartCode = 5;
124126
optional int64 close_proc_id = 6 [default = -1];
127+
// Master active time as fencing token
128+
optional int64 initiating_master_active_time = 8;
125129
}
126130

127131
message CloseRegionResponse {
@@ -271,6 +275,8 @@ message RemoteProcedureRequest {
271275
required uint64 proc_id = 1;
272276
required string proc_class = 2;
273277
optional bytes proc_data = 3;
278+
// Master active time as fencing token
279+
optional int64 initiating_master_active_time = 4;
274280
}
275281

276282
message ExecuteProceduresRequest {

hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ message RegionStateTransition {
9797
optional uint64 open_seq_num = 3;
9898

9999
repeated int64 proc_id = 4;
100+
101+
// Master active time as fencing token
102+
optional int64 initiating_master_active_time = 5;
100103
enum TransitionCode {
101104
OPENED = 0;
102105
FAILED_OPEN = 1;
@@ -155,6 +158,8 @@ message RemoteProcedureResult {
155158
}
156159
required Status status = 2;
157160
optional ForeignExceptionMessage error = 3;
161+
// Master active time as fencing token
162+
optional int64 initiating_master_active_time = 4;
158163
}
159164
message ReportProcedureDoneRequest {
160165
repeated RemoteProcedureResult result = 1;

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3036,6 +3036,7 @@ public long getMasterStartTime() {
30363036
}
30373037

30383038
/** Returns timestamp in millis when HMaster became the active master. */
3039+
@Override
30393040
public long getMasterActiveTime() {
30403041
return masterActiveTime;
30413042
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import org.apache.hadoop.hbase.ClusterMetricsBuilder;
4242
import org.apache.hadoop.hbase.DoNotRetryIOException;
4343
import org.apache.hadoop.hbase.HConstants;
44+
import org.apache.hadoop.hbase.MasterNotRunningException;
4445
import org.apache.hadoop.hbase.MetaTableAccessor;
4546
import org.apache.hadoop.hbase.NamespaceDescriptor;
4647
import org.apache.hadoop.hbase.Server;
@@ -69,7 +70,6 @@
6970
import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
7071
import org.apache.hadoop.hbase.ipc.RpcServerFactory;
7172
import org.apache.hadoop.hbase.ipc.RpcServerInterface;
72-
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
7373
import org.apache.hadoop.hbase.ipc.ServerRpcController;
7474
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
7575
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
@@ -327,6 +327,7 @@
327327
import org.apache.hadoop.hbase.shaded.protobuf.generated.QuotaProtos.GetSpaceQuotaRegionSizesResponse;
328328
import org.apache.hadoop.hbase.shaded.protobuf.generated.QuotaProtos.GetSpaceQuotaRegionSizesResponse.RegionSizes;
329329
import org.apache.hadoop.hbase.shaded.protobuf.generated.RecentLogs;
330+
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos;
330331
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.FileArchiveNotificationRequest;
331332
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.FileArchiveNotificationResponse;
332333
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
@@ -1695,6 +1696,15 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(RpcContro
16951696
ReportRegionStateTransitionRequest req) throws ServiceException {
16961697
try {
16971698
master.checkServiceStarted();
1699+
for (RegionServerStatusProtos.RegionStateTransition transition : req.getTransitionList()) {
1700+
long procId =
1701+
transition.getProcIdCount() > 0 ? transition.getProcId(0) : Procedure.NO_PROC_ID;
1702+
// -1 is less than any possible master active time, so a report without the token is never rejected
1703+
long initiatingMasterActiveTime = transition.hasInitiatingMasterActiveTime()
1704+
? transition.getInitiatingMasterActiveTime()
1705+
: -1;
1706+
throwOnOldMaster(procId, initiatingMasterActiveTime);
1707+
}
16981708
return master.getAssignmentManager().reportRegionStateTransition(req);
16991709
} catch (IOException ioe) {
17001710
throw new ServiceException(ioe);
@@ -2379,8 +2389,14 @@ public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
23792389
// Check Masters is up and ready for duty before progressing. Remote side will keep trying.
23802390
try {
23812391
this.master.checkServiceStarted();
2382-
} catch (ServerNotRunningYetException snrye) {
2383-
throw new ServiceException(snrye);
2392+
for (RemoteProcedureResult result : request.getResultList()) {
2393+
// -1 is less than any possible master active time, so a result without the token is never rejected
2394+
long initiatingMasterActiveTime =
2395+
result.hasInitiatingMasterActiveTime() ? result.getInitiatingMasterActiveTime() : -1;
2396+
throwOnOldMaster(result.getProcId(), initiatingMasterActiveTime);
2397+
}
2398+
} catch (IOException ioe) {
2399+
throw new ServiceException(ioe);
23842400
}
23852401
request.getResultList().forEach(result -> {
23862402
if (result.getStatus() == RemoteProcedureResult.Status.SUCCESS) {
@@ -2393,6 +2409,18 @@ public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
23932409
return ReportProcedureDoneResponse.getDefaultInstance();
23942410
}
23952411

2412+
private void throwOnOldMaster(long procId, long initiatingMasterActiveTime)
2413+
throws MasterNotRunningException {
2414+
if (initiatingMasterActiveTime > master.getMasterActiveTime()) {
2415+
// procedure is initiated by new active master but report received on master with older active
2416+
// time
2417+
LOG.warn(
2418+
"Report for procId: {} and initiatingMasterAT {} received on master with activeTime {}",
2419+
procId, initiatingMasterActiveTime, master.getMasterActiveTime());
2420+
throw new MasterNotRunningException("Another master is active");
2421+
}
2422+
}
2423+
23962424
// HBCK Services
23972425

23982426
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,9 @@ long splitRegion(final RegionInfo regionInfo, final byte[] splitRow, final long
261261
/** Returns true if master is the active one */
262262
boolean isActiveMaster();
263263

264+
/** Returns timestamp in millis when this master became the active one. */
265+
long getMasterActiveTime();
266+
264267
/** Returns true if master is initialized */
265268
boolean isInitialized();
266269

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,9 @@ public TableOperationType getTableOperationType() {
6161
}
6262

6363
@Override
64-
public RemoteOperation newRemoteOperation() {
65-
return new RegionCloseOperation(this, region, getProcId(), assignCandidate);
64+
public RemoteOperation newRemoteOperation(MasterProcedureEnv env) {
65+
return new RegionCloseOperation(this, region, getProcId(), assignCandidate,
66+
env.getMasterServices().getMasterActiveTime());
6667
}
6768

6869
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,9 @@ public TableOperationType getTableOperationType() {
5757
}
5858

5959
@Override
60-
public RemoteOperation newRemoteOperation() {
61-
return new RegionOpenOperation(this, region, getProcId());
60+
public RemoteOperation newRemoteOperation(MasterProcedureEnv env) {
61+
return new RegionOpenOperation(this, region, getProcId(),
62+
env.getMasterServices().getMasterActiveTime());
6263
}
6364

6465
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,11 @@ public Optional<RemoteProcedureDispatcher.RemoteOperation> remoteCallBuild(Maste
9292
if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) {
9393
return Optional.empty();
9494
}
95-
return Optional.of(newRemoteOperation());
95+
return Optional.of(newRemoteOperation(env));
9696
}
9797

98-
protected abstract RemoteProcedureDispatcher.RemoteOperation newRemoteOperation();
98+
protected abstract RemoteProcedureDispatcher.RemoteOperation
99+
newRemoteOperation(MasterProcedureEnv env);
99100

100101
@Override
101102
public void remoteOperationCompleted(MasterProcedureEnv env) {

0 commit comments

Comments
 (0)