Skip to content

Commit 98fe0d0

Browse files
HADOOP-17979. Add Interface EtagSource to allow FileStatus subclasses to provide etags (#3633)
Contributed by Steve Loughran
1 parent e8566b3 commit 98fe0d0

File tree

13 files changed

+629
-13
lines changed

13 files changed

+629
-13
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonPathCapabilities.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,4 +146,22 @@ private CommonPathCapabilities() {
146146
*/
147147
public static final String ABORTABLE_STREAM =
148148
"fs.capability.outputstream.abortable";
149+
150+
/**
151+
* Does this FS support etags?
152+
* That is: will FileStatus entries from listing/getFileStatus
153+
* probes support EtagSource and return real values.
154+
*/
155+
public static final String ETAGS_AVAILABLE =
156+
"fs.capability.etags.available";
157+
158+
/**
159+
* Are etags guaranteed to be preserved across rename() operations?
160+
* FileSystems MUST NOT declare support for this feature
161+
* unless this holds.
162+
*/
163+
public static final String ETAGS_PRESERVED_IN_RENAME =
164+
"fs.capability.etags.preserved.in.rename";
165+
166+
149167
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.fs;
20+
21+
/**
22+
* An optional interface for {@link FileStatus} subclasses to implement
23+
* to provide access to etags.
24+
* If etags are available, the FS SHOULD also declare the matching PathCapabilities:
25+
* -- etag supported: {@link CommonPathCapabilities#ETAGS_AVAILABLE}.
26+
* -- etag consistent over rename:
27+
* {@link CommonPathCapabilities#ETAGS_PRESERVED_IN_RENAME}.
28+
*/
29+
public interface EtagSource {
30+
31+
/**
32+
* Return the etag of this file status.
33+
* A return value of null or "" means "no etag".
34+
* @return a possibly null or empty etag.
35+
*/
36+
String getEtag();
37+
38+
}

hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1240,7 +1240,7 @@ Renaming a file where the destination is a directory moves the file as a child
12401240
FS' where:
12411241
not exists(FS', src)
12421242
and exists(FS', dest)
1243-
and data(FS', dest) == data (FS, dest)
1243+
and data(FS', dest) == data(FS, src)
12441244
result = True
12451245

12461246

@@ -1698,3 +1698,92 @@ in:readahead | READAHEAD | CanSetReadahead | Set the readahead on the input st
16981698
dropbehind | DROPBEHIND | CanSetDropBehind | Drop the cache.
16991699
in:unbuffer | UNBUFFER | CanUnbuffer | Reduce the buffering on the input stream.
17001700

1701+
## <a name="etagsource"></a> Etag probes through the interface `EtagSource`
1702+
1703+
FileSystem implementations MAY support querying HTTP etags from `FileStatus`
1704+
entries. If so, the requirements are as follows:
1705+
1706+
### Etag support MUST BE provided across all list/`getFileStatus()` calls.
1707+
1708+
That is: when adding etag support, all operations which return `FileStatus` or `LocatedFileStatus`
1709+
entries MUST return subclasses which are instances of `EtagSource`.
1710+
1711+
### FileStatus instances MUST have etags whenever the remote store provides them.
1712+
1713+
To support etags, they MUST BE to be provided in both `getFileStatus()`
1714+
and list calls.
1715+
1716+
Implementors note: the core APIs which MUST BE overridden to achieve this are as follows:
1717+
1718+
```java
1719+
FileStatus getFileStatus(Path)
1720+
FileStatus[] listStatus(Path)
1721+
RemoteIterator<FileStatus> listStatusIterator(Path)
1722+
RemoteIterator<LocatedFileStatus> listFiles(Path, boolean)
1723+
```
1724+
1725+
1726+
### Etags of files MUST BE Consistent across all list/getFileStatus operations.
1727+
1728+
The value of `EtagSource.getEtag()` MUST be the same for list* queries which return etags as for calls of `getFileStatus()` for the specific object.
1729+
1730+
```java
1731+
((EtagSource)getFileStatus(path)).getEtag() == ((EtagSource)listStatus(path)[0]).getEtag()
1732+
```
1733+
1734+
Similarly, the same value MUST BE returned for `listFiles()`, `listStatusIterator()` of the path and
1735+
when listing the parent path, of all files in the listing.
1736+
1737+
### Etags MUST BE different for different file contents.
1738+
1739+
Two different arrays of data written to the same path MUST have different etag values when probed.
1740+
This is a requirement of the HTTP specification.
1741+
1742+
### Etags of files SHOULD BE preserved across rename operations
1743+
1744+
After a file is renamed, the value of `((EtagSource)getFileStatus(dest)).getEtag()`
1745+
SHOULD be the same as what the value of `((EtagSource)getFileStatus(source)).getEtag()`
1746+
was before the rename took place.
1747+
1748+
This is an implementation detail of the store; it does not hold for AWS S3.
1749+
1750+
If and only if the store consistently meets this requirement, the filesystem SHOULD
1751+
declare in `hasPathCapability()` that it supports
1752+
`fs.capability.etags.preserved.in.rename`
1753+
1754+
### Directories MAY have etags
1755+
1756+
Directory entries MAY return etags in listing/probe operations; these entries MAY be preserved across renames.
1757+
1758+
Equally, directory entries MAY NOT provide such entries, MAY NOT preserve them across renames,
1759+
and MAY NOT guarantee consistency over time.
1760+
1761+
Note: special mention of the root path "/".
1762+
As that isn't a real "directory", nobody should expect it to have an etag.
1763+
1764+
### All etag-aware `FileStatus` subclasses MUST BE `Serializable`; MAY BE `Writable`
1765+
1766+
The base `FileStatus` class implements `Serializable` and `Writable` and marshalls its fields appropriately.
1767+
1768+
Subclasses MUST support java serialization (Some Apache Spark applications use it), preserving the etag.
1769+
This is a matter of making the etag field non-static and adding a `serialVersionUID`.
1770+
1771+
The `Writable` support was used for marshalling status data over Hadoop IPC calls;
1772+
in Hadoop 3 that is implemented through `org/apache/hadoop/fs/protocolPB/PBHelper.java` and the methods are deprecated.
1773+
Subclasses MAY override the deprecated methods to add etag marshalling.
1774+
However, there is no expectation of this, and such marshalling is unlikely to ever take place.
1775+
1776+
### Appropriate etag Path Capabilities SHOULD BE declared
1777+
1778+
1. `hasPathCapability(path, "fs.capability.etags.available")` MUST return true iff
1779+
the filesystem returns valid (non-empty) etags on file status/listing operations.
1780+
2. `hasPathCapability(path, "fs.capability.etags.preserved.in.rename")` MUST return
1781+
true if and only if etags are preserved across renames.
1782+
1783+
### Non-requirements of etag support
1784+
1785+
* There is no requirement/expectation that `FileSystem.getFileChecksum(Path)` returns
1786+
a checksum value related to the etag of an object, if any value is returned.
1787+
* If the same data is uploaded twice to the same or a different path,
1788+
the etag of the second upload MAY NOT match that of the first upload.
1789+
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.fs.contract;
20+
21+
import java.nio.charset.StandardCharsets;
22+
23+
import org.assertj.core.api.Assertions;
24+
import org.junit.Assume;
25+
import org.junit.Test;
26+
import org.slf4j.Logger;
27+
import org.slf4j.LoggerFactory;
28+
29+
import org.apache.hadoop.fs.EtagSource;
30+
import org.apache.hadoop.fs.FileStatus;
31+
import org.apache.hadoop.fs.FileSystem;
32+
import org.apache.hadoop.fs.LocatedFileStatus;
33+
import org.apache.hadoop.fs.Path;
34+
35+
import static org.apache.hadoop.fs.CommonPathCapabilities.ETAGS_AVAILABLE;
36+
import static org.apache.hadoop.fs.CommonPathCapabilities.ETAGS_PRESERVED_IN_RENAME;
37+
38+
/**
39+
* For filesystems which support etags, validate correctness
40+
* of their implementation.
41+
*/
42+
public abstract class AbstractContractEtagTest extends
43+
AbstractFSContractTestBase {
44+
45+
private static final Logger LOG =
46+
LoggerFactory.getLogger(AbstractContractEtagTest.class);
47+
48+
/**
49+
* basic consistency across operations, as well as being non-empty.
50+
*/
51+
@Test
52+
public void testEtagConsistencyAcrossListAndHead() throws Throwable {
53+
describe("Etag values must be non-empty and consistent across LIST and HEAD Calls.");
54+
final Path path = methodPath();
55+
final FileSystem fs = getFileSystem();
56+
57+
Assertions.assertThat(fs.hasPathCapability(path, ETAGS_AVAILABLE))
58+
.describedAs("path capability %s of %s",
59+
ETAGS_AVAILABLE, path)
60+
.isTrue();
61+
62+
ContractTestUtils.touch(fs, path);
63+
64+
final FileStatus st = fs.getFileStatus(path);
65+
final String etag = etagFromStatus(st);
66+
LOG.info("etag of empty file is \"{}\"", etag);
67+
68+
final FileStatus[] statuses = fs.listStatus(path);
69+
Assertions.assertThat(statuses)
70+
.describedAs("List(%s)", path)
71+
.hasSize(1);
72+
final FileStatus lsStatus = statuses[0];
73+
Assertions.assertThat(etagFromStatus(lsStatus))
74+
.describedAs("etag of list status (%s) compared to HEAD value of %s", lsStatus, st)
75+
.isEqualTo(etag);
76+
}
77+
78+
/**
79+
* Get an etag from a FileStatus which MUST BE
80+
* an implementation of EtagSource and
81+
* whose etag MUST NOT BE null/empty.
82+
* @param st the status
83+
* @return the etag
84+
*/
85+
String etagFromStatus(FileStatus st) {
86+
Assertions.assertThat(st)
87+
.describedAs("FileStatus %s", st)
88+
.isInstanceOf(EtagSource.class);
89+
final String etag = ((EtagSource) st).getEtag();
90+
Assertions.assertThat(etag)
91+
.describedAs("Etag of %s", st)
92+
.isNotBlank();
93+
return etag;
94+
}
95+
96+
/**
97+
* Overwritten data has different etags.
98+
*/
99+
@Test
100+
public void testEtagsOfDifferentDataDifferent() throws Throwable {
101+
describe("Verify that two different blocks of data written have different tags");
102+
103+
final Path path = methodPath();
104+
final FileSystem fs = getFileSystem();
105+
Path src = new Path(path, "src");
106+
107+
ContractTestUtils.createFile(fs, src, true,
108+
"data1234".getBytes(StandardCharsets.UTF_8));
109+
final FileStatus srcStatus = fs.getFileStatus(src);
110+
final String srcTag = etagFromStatus(srcStatus);
111+
LOG.info("etag of file 1 is \"{}\"", srcTag);
112+
113+
// now overwrite with data of same length
114+
// (ensure that path or length aren't used exclusively as tag)
115+
ContractTestUtils.createFile(fs, src, true,
116+
"1234data".getBytes(StandardCharsets.UTF_8));
117+
118+
// validate
119+
final String tag2 = etagFromStatus(fs.getFileStatus(src));
120+
LOG.info("etag of file 2 is \"{}\"", tag2);
121+
122+
Assertions.assertThat(tag2)
123+
.describedAs("etag of updated file")
124+
.isNotEqualTo(srcTag);
125+
}
126+
127+
/**
128+
* If supported, rename preserves etags.
129+
*/
130+
@Test
131+
public void testEtagConsistencyAcrossRename() throws Throwable {
132+
describe("Verify that when a file is renamed, the etag remains unchanged");
133+
final Path path = methodPath();
134+
final FileSystem fs = getFileSystem();
135+
Assume.assumeTrue(
136+
"Filesystem does not declare that etags are preserved across renames",
137+
fs.hasPathCapability(path, ETAGS_PRESERVED_IN_RENAME));
138+
Path src = new Path(path, "src");
139+
Path dest = new Path(path, "dest");
140+
141+
ContractTestUtils.createFile(fs, src, true,
142+
"sample data".getBytes(StandardCharsets.UTF_8));
143+
final FileStatus srcStatus = fs.getFileStatus(src);
144+
LOG.info("located file status string value " + srcStatus);
145+
146+
final String srcTag = etagFromStatus(srcStatus);
147+
LOG.info("etag of short file is \"{}\"", srcTag);
148+
149+
Assertions.assertThat(srcTag)
150+
.describedAs("Etag of %s", srcStatus)
151+
.isNotBlank();
152+
153+
// rename
154+
fs.rename(src, dest);
155+
156+
// validate
157+
FileStatus destStatus = fs.getFileStatus(dest);
158+
final String destTag = etagFromStatus(destStatus);
159+
Assertions.assertThat(destTag)
160+
.describedAs("etag of list status (%s) compared to HEAD value of %s",
161+
destStatus, srcStatus)
162+
.isEqualTo(srcTag);
163+
}
164+
165+
/**
166+
* For effective use of etags, listLocatedStatus SHOULD return status entries
167+
* with consistent values.
168+
* This ensures that listing during query planning can collect and use the etags.
169+
*/
170+
@Test
171+
public void testLocatedStatusAlsoHasEtag() throws Throwable {
172+
describe("verify that listLocatedStatus() and listFiles() are etag sources");
173+
final Path path = methodPath();
174+
final FileSystem fs = getFileSystem();
175+
Path src = new Path(path, "src");
176+
ContractTestUtils.createFile(fs, src, true,
177+
"sample data".getBytes(StandardCharsets.UTF_8));
178+
final FileStatus srcStatus = fs.getFileStatus(src);
179+
final String srcTag = etagFromStatus(srcStatus);
180+
final LocatedFileStatus entry = fs.listLocatedStatus(path).next();
181+
LOG.info("located file status string value " + entry);
182+
final String listTag = etagFromStatus(entry);
183+
Assertions.assertThat(listTag)
184+
.describedAs("etag of listLocatedStatus (%s) compared to HEAD value of %s",
185+
entry, srcStatus)
186+
.isEqualTo(srcTag);
187+
188+
final LocatedFileStatus entry2 = fs.listFiles(path, false).next();
189+
Assertions.assertThat(etagFromStatus(entry2))
190+
.describedAs("etag of listFiles (%s) compared to HEAD value of %s",
191+
entry, srcStatus)
192+
.isEqualTo(srcTag);
193+
}
194+
}

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileStatus.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import org.apache.hadoop.classification.InterfaceAudience;
2121
import org.apache.hadoop.classification.InterfaceStability;
22+
import org.apache.hadoop.fs.EtagSource;
2223
import org.apache.hadoop.fs.FileStatus;
2324
import org.apache.hadoop.fs.Path;
2425

@@ -30,7 +31,7 @@
3031
*/
3132
@InterfaceAudience.Private
3233
@InterfaceStability.Evolving
33-
public class S3AFileStatus extends FileStatus {
34+
public class S3AFileStatus extends FileStatus implements EtagSource {
3435

3536
private static final long serialVersionUID = -5955674081978903922L;
3637

@@ -166,8 +167,16 @@ public void setIsEmptyDirectory(Tristate isEmptyDirectory) {
166167

167168
/**
* Legacy accessor for the etag; delegates to {@link #getEtag()}.
168169
* @return the S3 object eTag when available, else null.
170+
* @deprecated use {@link EtagSource#getEtag()} for
171+
* public access.
169172
*/
173+
@Deprecated
170174
public String getETag() {
175+
return getEtag();
176+
}
177+
178+
/**
* Return the value of the {@code eTag} field of this status.
* @return the S3 object eTag when available, else null.
*/
@Override
179+
public String getEtag() {
171180
return eTag;
172181
}
173182

0 commit comments

Comments
 (0)