Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
03e670b
HDDS-7149. Update ratis version to 2.4.0 and thirdparty version to 1.…
captainzmc Oct 19, 2022
a6316c8
HDDS-7352. OM log flooded by AWSV4AuthValidator (#3857)
xBis7 Oct 19, 2022
237a9a1
HDDS-7058. EC: ReplicationManager - Implement ratis container replica…
sodonnel Oct 19, 2022
977ab59
HDDS-7341. EC: Close pipelines with unregistered nodes (#3850)
sodonnel Oct 19, 2022
d5dc65e
HDDS-7305. Fix Hadoop imports (#3822)
adoroszlai Oct 19, 2022
d32e96c
HDDS-7351. Use jackson-bom to ensure consistent Jackson version (#3856)
adoroszlai Oct 19, 2022
e45f9b8
HDDS-7199. Implement new mix workload Read/Write Freon command which …
DaveTeng0 Oct 19, 2022
ff6d15f
HDDS-7354. SchemaV3 blockData not deleted in table (#3860)
Xushaohong Oct 20, 2022
3fd7cd0
Revert "HDDS-7199. Implement new mix workload Read/Write Freon comman…
adoroszlai Oct 20, 2022
1fa6d02
HDDS-6930. SCM,OM,RECON should not print ERROR and exit with code 1 o…
navinko Oct 20, 2022
df48ca4
HDDS-7356. Update SCM-HA.zh.md to match the English version (#3861)
kaijchen Oct 20, 2022
31560fc
HDDS-7355. non-primordial scm fail to get signed cert from primordial…
Oct 20, 2022
f9b74a2
HDDS-6210. EC: Add EC metrics (#3851)
aswinshakil Oct 20, 2022
5c2a393
HDDS-7369. Fix wrong order of command arguments in Nonrolling-Upgrade…
zhtttylz Oct 20, 2022
13a6d01
HDDS-7141. Recon: Improve Disk Usage Page (#3789)
smitajoshi12 Oct 21, 2022
ae59f8a
HDDS-7248. Recon: Expand the container status page to show all unheal…
smitajoshi12 Oct 21, 2022
ecdfc20
HDDS-7199. Implement new mix workload Read/Write Freon command (#3872)
DaveTeng0 Oct 22, 2022
fdc57a9
HDDS-7403. README Security Improvement (#3879)
SaketaChalamchala Oct 24, 2022
df0d1e8
HDDS-7368. [Multi-Tenant] Add Volume Existence check in preExecute fo…
aswinshakil Oct 24, 2022
2f11175
HDDS-7284. JVM crash for rocksdb for read/write after close (#3801)
sumitagrawl Oct 25, 2022
74aef20
HDDS-7182. Add property to control RocksDB max open files (#3843)
ChenSammi Oct 25, 2022
b9a47f6
HDDS-7253. Fix exception when '/' in key name (#3774)
xichen01 Oct 25, 2022
340f3a7
HDDS-7381. Cleanup of VolumeManagerImpl (#3873)
myskov Oct 25, 2022
dfc13a0
HDDS-7258. Cleanup the allocated but uncommitted blocks (#3778)
Xushaohong Oct 25, 2022
5b7f448
HDDS-7121. Support namespace summaries (du, dist & counts) for legacy…
xBis7 Oct 25, 2022
462f32d
HDDS-7396. Force close non-RATIS containers in ReplicationManager (#3…
kaijchen Oct 26, 2022
561788e
Revert "HDDS-7253. Fix exception when '/' in key name (#3774)"
kaijchen Oct 26, 2022
9449747
HDDS-7413. Fix logging while marking container state unhealthy (#3887)
swamirishi Oct 26, 2022
a664cca
HDDS-7342. Move encryption-related code from MultipartCryptoKeyInputS…
Cyrill Oct 26, 2022
b5ecea6
HDDS-7349. Flaky integration test have memory leak for RatisDropwizar…
sumitagrawl Oct 26, 2022
0652ba4
HDDS-7422. Bump woodstox-core from 5.0.3 to 5.4.0 (#3886)
dependabot[bot] Oct 26, 2022
557d7f8
HDDS-7402. Adapt CommandQueue to track the count of each queued comma…
sodonnel Oct 27, 2022
3fb0dfb
HDDS-7407. EC: Block allocation should not be stripped across the EC …
kaijchen Oct 27, 2022
901bcf2
HDDS-7424. Bump jetty to 9.4.49.v20220914 (#3894)
Oct 27, 2022
a84bace
HDDS-7220. SCM should use sub-ca certificate for token signature with…
ChenSammi Oct 27, 2022
880f87c
HDDS-7370. Add pending commands in SCM to Datanode command count (#3867)
sodonnel Oct 27, 2022
f6cad7c
HDDS-7328. Improve Deletion of FSO Paths (#3844)
Xushaohong Oct 27, 2022
dd67faa
HDDS-7406. Remove unused the exception & improve debug log in KeyDele…
Xushaohong Oct 27, 2022
3a52215
HDDS-1157. TestOzoneContainerWithTLS is failing with SSLHandshakeExce…
ChenSammi Oct 27, 2022
f0d41c5
HDDS-7316. Print stacktrace to identify the location of RocksObject l…
sadanand48 Oct 27, 2022
bd8a161
HDDS-7421. Respect OZONE_LOGLEVEL and OZONE_ROOT_LOGGER for CLI comma…
adoroszlai Oct 27, 2022
30d1a4e
HDDS-7420. Bump Spring framework from 5.2.20 to 5.3.23 (#3902)
adoroszlai Oct 28, 2022
cc574c6
HDDS-7432. Move command summary into Commands object in CommandQueue …
sodonnel Oct 28, 2022
7394f2c
HDDS-7361. Add general metrics for queues in Datanode (#3863)
symious Oct 28, 2022
c187a8f
HDDS-7384. EC: ReplicationManager - implement deleting container hand…
Oct 28, 2022
3294d28
HDDS-7231. Integrate the GetKeyInfo API to key read flows (#3800)
duongkame Oct 28, 2022
2050ebb
HDDS-7419. Integrate the GetKeyInfo API to OFS
duongkame Oct 27, 2022
42aed99
Update integration test.
duongkame Oct 27, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Ozone is a scalable, redundant, and distributed object store for Hadoop and Clou
* SCALABLE: Ozone is designed to scale to tens of billions of files and blocks and, in the future, even more.
* CONSISTENT: Ozone is a strongly consistent object store. This consistency is achieved by using protocols like RAFT.
* CLOUD-NATIVE: Ozone is designed to work well in containerized environments like YARN and Kubernetes.
* SECURE: Ozone integrates with Kerberos infrastructure for access control and supports TDE and on-wire encryption.
* SECURE: Ozone integrates with Kerberos infrastructure for authentication; supports native ACLs and integrates with Ranger for access control; and supports TDE and on-wire encryption.
* HIGHLY AVAILABLE: Ozone is a fully replicated system that is designed to survive multiple failures.

## Documentation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ public class XceiverClientMetrics {

private @Metric MutableCounterLong pendingOps;
private @Metric MutableCounterLong totalOps;
private @Metric MutableCounterLong ecReconstructionTotal;
private @Metric MutableCounterLong ecReconstructionFailsTotal;
private MutableCounterLong[] pendingOpsArray;
private MutableCounterLong[] opsArray;
private MutableRate[] containerOpsLatency;
Expand Down Expand Up @@ -100,6 +102,14 @@ public long getPendingContainerOpCountMetrics(ContainerProtos.Type type) {
return pendingOpsArray[type.ordinal()].value();
}

/**
 * Increments the counter of EC (erasure-coded) block reconstruction
 * read attempts. Called once each time a reconstruction reader is
 * created for a degraded EC block.
 */
public void incECReconstructionTotal() {
ecReconstructionTotal.incr();
}

/**
 * Increments the counter of failed EC (erasure-coded) block
 * reconstruction reads. Called when a reconstruction reader throws,
 * i.e. reconstruction could not recover the data.
 */
public void incECReconstructionFailsTotal() {
ecReconstructionFailsTotal.incr();
}

@VisibleForTesting
public long getTotalOpCount() {
return totalOps.value();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.apache.hadoop.security.token.Token;

import com.google.common.annotations.VisibleForTesting;
import org.apache.ratis.thirdparty.io.grpc.Status;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -140,18 +141,22 @@ public synchronized void initialize() throws IOException {
IOException catchEx = null;
do {
try {
// If refresh returns new pipeline, retry with it.
// If we get IOException due to connectivity issue,
// retry according to retry policy.
chunks = getChunkInfos();
break;
// If we get a StorageContainerException or an IOException due to
// datanodes are not reachable, refresh to get the latest pipeline
// info and retry.
// Otherwise, just retry according to the retry policy.
} catch (SCMSecurityException ex) {
throw ex;
} catch (StorageContainerException ex) {
refreshPipeline(ex);
catchEx = ex;
} catch (IOException ex) {
LOG.debug("Retry to get chunk info fail", ex);
if (isConnectivityIssue(ex)) {
refreshPipeline(ex);
}
catchEx = ex;
}
} while (shouldRetryRead(catchEx));
Expand Down Expand Up @@ -187,19 +192,19 @@ public synchronized void initialize() throws IOException {
}
}

/**
 * Check if this exception is because datanodes are not reachable.
 *
 * Unwraps the gRPC status from the throwable's cause chain and treats
 * status code {@code UNAVAILABLE} as a connectivity problem; any other
 * status (or a non-gRPC cause, which maps to {@code UNKNOWN}) is not.
 *
 * @param ex the exception raised while reading from a datanode
 * @return true if the underlying gRPC status is UNAVAILABLE
 */
private boolean isConnectivityIssue(IOException ex) {
return Status.fromThrowable(ex).getCode() == Status.UNAVAILABLE.getCode();
}

private void refreshPipeline(IOException cause) throws IOException {
LOG.info("Unable to read information for block {} from pipeline {}: {}",
blockID, pipeline.getId(), cause.getMessage());
if (refreshPipelineFunction != null) {
LOG.debug("Re-fetching pipeline for block {}", blockID);
Pipeline newPipeline = refreshPipelineFunction.apply(blockID);
if (newPipeline == null || newPipeline.sameDatanodes(pipeline)) {
LOG.warn("No new pipeline for block {}", blockID);
throw cause;
} else {
LOG.debug("New pipeline got for block {}", blockID);
this.pipeline = newPipeline;
}
this.pipeline = refreshPipelineFunction.apply(blockID);
} else {
throw cause;
}
Expand Down Expand Up @@ -301,21 +306,27 @@ protected synchronized int readWithStrategy(ByteReaderStrategy strategy)
int numBytesRead;
try {
numBytesRead = strategy.readFromBlock(current, numBytesToRead);
retries = 0; // reset retries after successful read
retries = 0;
// If we get a StorageContainerException or an IOException due to
// datanodes are not reachable, refresh to get the latest pipeline
// info and retry.
// Otherwise, just retry according to the retry policy.
} catch (SCMSecurityException ex) {
throw ex;
} catch (StorageContainerException e) {
if (shouldRetryRead(e)) {
handleReadError(e);
continue;
} else {
throw e;
}
} catch (SCMSecurityException ex) {
throw ex;
} catch (IOException ex) {
// We got a IOException which might be due
// to DN down or connectivity issue.
if (shouldRetryRead(ex)) {
current.releaseClient();
if (isConnectivityIssue(ex)) {
handleReadError(ex);
} else {
current.releaseClient();
}
continue;
} else {
throw ex;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ReadChunkResponseProto;
import org.apache.hadoop.hdds.scm.XceiverClientFactory;
import org.apache.hadoop.hdds.scm.XceiverClientSpi;
import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.ozone.common.Checksum;
import org.apache.hadoop.ozone.common.ChecksumData;
Expand Down Expand Up @@ -425,20 +424,12 @@ protected ByteBuffer[] readChunk(ChunkInfo readChunkInfo)
throws IOException {
ReadChunkResponseProto readChunkResponse;

try {
List<CheckedBiFunction> validators =
ContainerProtocolCalls.getValidatorList();
validators.add(validator);
List<CheckedBiFunction> validators =
ContainerProtocolCalls.getValidatorList();
validators.add(validator);

readChunkResponse = ContainerProtocolCalls.readChunk(xceiverClient,
readChunkInfo, blockID, validators, token);

} catch (IOException e) {
if (e instanceof StorageContainerException) {
throw e;
}
throw new IOException("Unexpected OzoneException: " + e.toString(), e);
}
readChunkResponse = ContainerProtocolCalls.readChunk(xceiverClient,
readChunkInfo, blockID, validators, token);

if (readChunkResponse.hasData()) {
return readChunkResponse.getData().asReadOnlyByteBufferList()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.hadoop.hdds.client.ECReplicationConfig;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.scm.XceiverClientFactory;
import org.apache.hadoop.hdds.scm.XceiverClientManager;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.storage.BlockExtendedInputStream;
import org.apache.hadoop.hdds.scm.storage.BlockLocationInfo;
Expand Down Expand Up @@ -117,6 +118,10 @@ private synchronized void setReaderType() {
}

private void createBlockReader() {
if (reconstructionReader) {
XceiverClientManager.getXceiverClientMetrics()
.incECReconstructionTotal();
}
blockReader = ecBlockInputStreamFactory.create(reconstructionReader,
failedLocations, repConfig, blockInfo, verifyChecksum,
xceiverClientFactory, refreshFunction);
Expand Down Expand Up @@ -162,6 +167,8 @@ public synchronized int read(ByteBuffer buf) throws IOException {
// If we get an error from the reconstruction reader, there
// is nothing left to try. It will re-try until it has insufficient
// locations internally, so if an error comes here, just re-throw it.
XceiverClientManager.getXceiverClientMetrics()
.incECReconstructionFailsTotal();
throw e;
}
if (e instanceof BadDataLocationException) {
Expand Down
Loading