Skip to content

Commit dbe98f3

Browse files
committed
HBASE-22460 : Reopen regions with very high Store Ref Counts
1 parent 5217618 commit dbe98f3

File tree

20 files changed

+1054
-4
lines changed

20 files changed

+1054
-4
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetrics.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,4 +154,10 @@ default String getNameAsString() {
154154
* @return the reference count for the stores of this region
155155
*/
156156
int getStoreRefCount();
157+
158+
/**
159+
* @return the max reference count for any store file among all store files
160+
* of this region
161+
*/
162+
int getMaxStoreFileRefCount();
157163
}

hbase-client/src/main/java/org/apache/hadoop/hbase/RegionMetricsBuilder.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ public static RegionMetrics toRegionMetrics(ClusterStatusProtos.RegionLoad regio
6666
.setStoreCount(regionLoadPB.getStores())
6767
.setStoreFileCount(regionLoadPB.getStorefiles())
6868
.setStoreRefCount(regionLoadPB.getStoreRefCount())
69+
.setMaxStoreFileRefCount(regionLoadPB.getMaxStoreFileRefCount())
6970
.setStoreFileSize(new Size(regionLoadPB.getStorefileSizeMB(), Size.Unit.MEGABYTE))
7071
.setStoreSequenceIds(regionLoadPB.getStoreCompleteSequenceIdList().stream()
7172
.collect(Collectors.toMap(
@@ -113,6 +114,7 @@ public static ClusterStatusProtos.RegionLoad toRegionLoad(RegionMetrics regionMe
113114
.setStores(regionMetrics.getStoreCount())
114115
.setStorefiles(regionMetrics.getStoreFileCount())
115116
.setStoreRefCount(regionMetrics.getStoreRefCount())
117+
.setMaxStoreFileRefCount(regionMetrics.getMaxStoreFileRefCount())
116118
.setStorefileSizeMB((int) regionMetrics.getStoreFileSize().get(Size.Unit.MEGABYTE))
117119
.addAllStoreCompleteSequenceId(toStoreSequenceId(regionMetrics.getStoreSequenceId()))
118120
.setStoreUncompressedSizeMB(
@@ -128,6 +130,7 @@ public static RegionMetricsBuilder newBuilder(byte[] name) {
128130
private int storeCount;
129131
private int storeFileCount;
130132
private int storeRefCount;
133+
private int maxStoreFileRefCount;
131134
private long compactingCellCount;
132135
private long compactedCellCount;
133136
private Size storeFileSize = Size.ZERO;
@@ -161,6 +164,10 @@ public RegionMetricsBuilder setStoreRefCount(int value) {
161164
this.storeRefCount = value;
162165
return this;
163166
}
167+
public RegionMetricsBuilder setMaxStoreFileRefCount(int value) {
168+
this.maxStoreFileRefCount = value;
169+
return this;
170+
}
164171
public RegionMetricsBuilder setCompactingCellCount(long value) {
165172
this.compactingCellCount = value;
166173
return this;
@@ -235,6 +242,7 @@ public RegionMetrics build() {
235242
storeCount,
236243
storeFileCount,
237244
storeRefCount,
245+
maxStoreFileRefCount,
238246
compactingCellCount,
239247
compactedCellCount,
240248
storeFileSize,
@@ -259,6 +267,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
259267
private final int storeCount;
260268
private final int storeFileCount;
261269
private final int storeRefCount;
270+
private final int maxStoreFileRefCount;
262271
private final long compactingCellCount;
263272
private final long compactedCellCount;
264273
private final Size storeFileSize;
@@ -280,6 +289,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
280289
int storeCount,
281290
int storeFileCount,
282291
int storeRefCount,
292+
int maxStoreFileRefCount,
283293
final long compactingCellCount,
284294
long compactedCellCount,
285295
Size storeFileSize,
@@ -301,6 +311,7 @@ private static class RegionMetricsImpl implements RegionMetrics {
301311
this.storeCount = storeCount;
302312
this.storeFileCount = storeFileCount;
303313
this.storeRefCount = storeRefCount;
314+
this.maxStoreFileRefCount = maxStoreFileRefCount;
304315
this.compactingCellCount = compactingCellCount;
305316
this.compactedCellCount = compactedCellCount;
306317
this.storeFileSize = Preconditions.checkNotNull(storeFileSize);
@@ -340,6 +351,11 @@ public int getStoreRefCount() {
340351
return storeRefCount;
341352
}
342353

354+
@Override
355+
public int getMaxStoreFileRefCount() {
356+
return maxStoreFileRefCount;
357+
}
358+
343359
@Override
344360
public Size getStoreFileSize() {
345361
return storeFileSize;
@@ -433,6 +449,8 @@ public String toString() {
433449
this.getStoreFileCount());
434450
Strings.appendKeyValue(sb, "storeRefCount",
435451
this.getStoreRefCount());
452+
Strings.appendKeyValue(sb, "maxStoreFileRefCount",
453+
this.getMaxStoreFileRefCount());
436454
Strings.appendKeyValue(sb, "uncompressedStoreFileSize",
437455
this.getUncompressedStoreFileSize());
438456
Strings.appendKeyValue(sb, "lastMajorCompactionTimestamp",

hbase-client/src/main/java/org/apache/hadoop/hbase/ServerMetricsBuilder.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,8 @@ public long getLastReportTimestamp() {
343343
public String toString() {
344344
int storeCount = 0;
345345
int storeFileCount = 0;
346+
int storeRefCount = 0;
347+
int maxStoreFileRefCount = 0;
346348
long uncompressedStoreFileSizeMB = 0;
347349
long storeFileSizeMB = 0;
348350
long memStoreSizeMB = 0;
@@ -358,6 +360,9 @@ public String toString() {
358360
for (RegionMetrics r : getRegionMetrics().values()) {
359361
storeCount += r.getStoreCount();
360362
storeFileCount += r.getStoreFileCount();
363+
storeRefCount += r.getStoreRefCount();
364+
int currentMaxStoreFileRefCount = r.getMaxStoreFileRefCount();
365+
maxStoreFileRefCount = Math.max(maxStoreFileRefCount, currentMaxStoreFileRefCount);
361366
uncompressedStoreFileSizeMB += r.getUncompressedStoreFileSize().get(Size.Unit.MEGABYTE);
362367
storeFileSizeMB += r.getStoreFileSize().get(Size.Unit.MEGABYTE);
363368
memStoreSizeMB += r.getMemStoreSize().get(Size.Unit.MEGABYTE);
@@ -379,6 +384,8 @@ public String toString() {
379384
Strings.appendKeyValue(sb, "maxHeapMB", getMaxHeapSize());
380385
Strings.appendKeyValue(sb, "numberOfStores", storeCount);
381386
Strings.appendKeyValue(sb, "numberOfStorefiles", storeFileCount);
387+
Strings.appendKeyValue(sb, "storeRefCount", storeRefCount);
388+
Strings.appendKeyValue(sb, "maxStoreFileRefCount", maxStoreFileRefCount);
382389
Strings.appendKeyValue(sb, "storefileUncompressedSizeMB", uncompressedStoreFileSizeMB);
383390
Strings.appendKeyValue(sb, "storefileSizeMB", storeFileSizeMB);
384391
if (uncompressedStoreFileSizeMB != 0) {

hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1470,6 +1470,13 @@ public enum OperationStatusCode {
14701470
// User defined Default TTL config key
14711471
public static final String DEFAULT_SNAPSHOT_TTL_CONFIG_KEY = "hbase.master.snapshot.ttl";
14721472

1473+
// Regions Recovery based on high storeFileRefCount threshold value
1474+
public static final String STORE_FILE_REF_COUNT_THRESHOLD =
1475+
"hbase.regions.recovery.store.file.ref.count";
1476+
1477+
// default -1 indicates there is no threshold on high storeFileRefCount
1478+
public static final int DEFAULT_STORE_FILE_REF_COUNT_THRESHOLD = -1;
1479+
14731480
/**
14741481
* Configurations for master executor services.
14751482
*/

hbase-common/src/main/resources/hbase-default.xml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1901,4 +1901,33 @@ possible configurations would overwhelm and obscure the important.
19011901
automatically deleted until it is manually deleted
19021902
</description>
19031903
</property>
1904+
<property>
1905+
<name>hbase.master.regions.recovery.check.interval</name>
1906+
<value>1200000</value>
1907+
<description>
1908+
Regions Recovery Chore interval in milliseconds.
1909+
This chore keeps running at this interval to
1910+
find all regions whose max store file ref count exceeds the configured threshold
1911+
and reopens them.
1912+
</description>
1913+
</property>
1914+
<property>
1915+
<name>hbase.regions.recovery.store.file.ref.count</name>
1916+
<value>-1</value>
1917+
<description>
1918+
A very large ref count on a file indicates
1919+
that there is a ref leak on that object. Such files
1920+
cannot be removed even after they are invalidated
1921+
via compaction. The only way to recover in such a
1922+
scenario is to reopen the region which can
1923+
release all resources, like the refcount, leases, etc.
1924+
This config represents Store files Ref Count threshold
1925+
value considered for reopening regions.
1926+
Any region with store files ref count > this value
1927+
would be eligible for reopening by master.
1928+
Default value -1 indicates this feature is turned off.
1929+
Only a positive integer value should be provided to enable
1930+
this feature.
1931+
</description>
1932+
</property>
19041933
</configuration>

hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo
233233
String STOREFILE_COUNT_DESC = "Number of Store Files";
234234
String STORE_REF_COUNT = "storeRefCount";
235235
String STORE_REF_COUNT_DESC = "Store reference count";
236+
String MAX_STORE_FILE_REF_COUNT = "maxStoreFileRefCount";
236237
String MEMSTORE_SIZE = "memStoreSize";
237238
String MEMSTORE_SIZE_DESC = "Size of the memstore";
238239
String STOREFILE_SIZE = "storeFileSize";

hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionWrapper.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,4 +164,10 @@ public interface MetricsRegionWrapper {
164164
* @return the number of references active on the store
165165
*/
166166
long getStoreRefCount();
167+
168+
/**
169+
* @return the max number of references active on any store file among
170+
* all store files that belong to this region
171+
*/
172+
long getMaxStoreFileRefCount();
167173
}

hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,10 @@ void snapshot(MetricsRecordBuilder mrb, boolean ignored) {
217217
regionNamePrefix + MetricsRegionServerSource.STORE_REF_COUNT,
218218
MetricsRegionServerSource.STORE_REF_COUNT),
219219
this.regionWrapper.getStoreRefCount());
220+
mrb.addGauge(Interns.info(
221+
regionNamePrefix + MetricsRegionServerSource.MAX_STORE_FILE_REF_COUNT,
222+
MetricsRegionServerSource.MAX_STORE_FILE_REF_COUNT),
223+
this.regionWrapper.getMaxStoreFileRefCount());
220224
mrb.addGauge(Interns.info(
221225
regionNamePrefix + MetricsRegionServerSource.MEMSTORE_SIZE,
222226
MetricsRegionServerSource.MEMSTORE_SIZE_DESC),

hbase-hadoop2-compat/src/test/java/org/apache/hadoop/hbase/regionserver/TestMetricsRegionSourceImpl.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,11 @@ public long getStoreRefCount() {
9999
return 0;
100100
}
101101

102+
@Override
103+
public long getMaxStoreFileRefCount() {
104+
return 0;
105+
}
106+
102107
@Override
103108
public long getMemStoreSize() {
104109
return 0;

hbase-protocol-shaded/src/main/protobuf/ClusterStatus.proto

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,12 @@ message RegionLoad {
149149

150150
/** the number of references active on the store */
151151
optional int32 store_ref_count = 21 [default = 0];
152+
153+
/**
154+
* The max number of references active on a single store file among all store files
155+
* that belong to the given region
156+
*/
157+
optional int32 max_store_file_ref_count = 22 [default = 0];
152158
}
153159

154160
/* Server-level protobufs */

0 commit comments

Comments
 (0)