diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/OzoneClientConfig.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/OzoneClientConfig.java index 63dd5115962f..fd3ab4751f51 100644 --- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/OzoneClientConfig.java +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/OzoneClientConfig.java @@ -66,6 +66,37 @@ public enum ChecksumCombineMode { tags = ConfigTag.CLIENT) private int streamBufferSize = 4 * 1024 * 1024; + @Config(key = "datastream.buffer.flush.size", + defaultValue = "16MB", + type = ConfigType.SIZE, + description = "The buffer size boundary at which putBlock is executed", + tags = ConfigTag.CLIENT) + private long dataStreamBufferFlushSize = 16 * 1024 * 1024; + + @Config(key = "datastream.min.packet.size", + defaultValue = "1MB", + type = ConfigType.SIZE, + description = "The maximum size of each ByteBuffer " + + "used for ratis streaming writes", + tags = ConfigTag.CLIENT) + private int dataStreamMinPacketSize = 1024 * 1024; + + @Config(key = "datastream.window.size", + defaultValue = "64MB", + type = ConfigType.SIZE, + description = "Maximum size of the BufferList (used for retry) per " + + "BlockDataStreamOutput instance", + tags = ConfigTag.CLIENT) + private long streamWindowSize = 64 * 1024 * 1024; + + @Config(key = "datastream.pipeline.mode", + defaultValue = "true", + description = "Streaming write supports both pipeline mode (datanode1->" + + "datanode2->datanode3) and star mode (datanode1->datanode2, " + + "datanode1->datanode3). By default, pipeline mode is used.", + tags = ConfigTag.CLIENT) + private boolean datastreamPipelineMode = true; + @Config(key = "stream.buffer.increment", defaultValue = "0B", type = ConfigType.SIZE, @@ -236,6 +267,22 @@ public void setStreamBufferMaxSize(long streamBufferMaxSize) { this.streamBufferMaxSize = streamBufferMaxSize; } + public int getDataStreamMinPacketSize() { + return dataStreamMinPacketSize; + } + + public void setDataStreamMinPacketSize(int dataStreamMinPacketSize) { + this.dataStreamMinPacketSize = dataStreamMinPacketSize; + } + + public long getStreamWindowSize() { + return streamWindowSize; + } + + public void setStreamWindowSize(long streamWindowSize) { + this.streamWindowSize = streamWindowSize; + } + public int getMaxRetryCount() { return maxRetryCount; } @@ -288,6 +335,14 @@ public int getBufferIncrement() { return bufferIncrement; } + public long getDataStreamBufferFlushSize() { + return dataStreamBufferFlushSize; + } + + public void setDataStreamBufferFlushSize(long dataStreamBufferFlushSize) { + this.dataStreamBufferFlushSize = dataStreamBufferFlushSize; + } + public ChecksumCombineMode getChecksumCombineMode() { try { return ChecksumCombineMode.valueOf(checksumCombineMode); @@ -307,4 +362,12 @@ public void setEcReconstructStripeReadPoolLimit(int poolLimit) { public int getEcReconstructStripeReadPoolLimit() { return ecReconstructStripeReadPoolLimit; } + + public boolean isDatastreamPipelineMode() { + return datastreamPipelineMode; + } + + public void setDatastreamPipelineMode(boolean datastreamPipelineMode) { + this.datastreamPipelineMode = datastreamPipelineMode; + } } diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java index d0fd0db12950..3ea269b08b08 100644 --- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java +++
b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientRatis.java @@ -53,6 +53,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import org.apache.ratis.client.RaftClient; +import org.apache.ratis.client.api.DataStreamApi; import org.apache.ratis.grpc.GrpcTlsConfig; import org.apache.ratis.proto.RaftProtos; import org.apache.ratis.proto.RaftProtos.ReplicationLevel; @@ -135,7 +136,7 @@ private long updateCommitInfosMap(RaftClientReply reply) { .orElse(0L); } - private long updateCommitInfosMap( + public long updateCommitInfosMap( Collection commitInfoProtos) { // if the commitInfo map is empty, just update the commit indexes for each // of the servers @@ -382,4 +383,8 @@ public XceiverClientReply sendCommandAsync( throw new UnsupportedOperationException( "Operation Not supported for ratis client"); } + + public DataStreamApi getDataStreamApi() { + return this.getClient().getDataStreamApi(); + } } diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/AbstractDataStreamOutput.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/AbstractDataStreamOutput.java new file mode 100644 index 000000000000..cad1d0479249 --- /dev/null +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/AbstractDataStreamOutput.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.storage; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import org.apache.hadoop.hdds.scm.client.HddsClientUtils; +import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.io.retry.RetryPolicy; +import org.apache.ratis.protocol.exceptions.AlreadyClosedException; +import org.apache.ratis.protocol.exceptions.RaftRetryFailureException; + +import java.io.IOException; +import java.io.InterruptedIOException; +import java.util.Map; +import java.util.Objects; + +/** + * This class is used for error handling methods. + */ +public abstract class AbstractDataStreamOutput + implements ByteBufferStreamOutput { + + private final Map, RetryPolicy> retryPolicyMap; + private int retryCount; + private boolean isException; + + protected AbstractDataStreamOutput( + Map, RetryPolicy> retryPolicyMap) { + this.retryPolicyMap = retryPolicyMap; + this.isException = false; + this.retryCount = 0; + } + + @VisibleForTesting + public int getRetryCount() { + return retryCount; + } + + protected void resetRetryCount() { + retryCount = 0; + } + + protected boolean isException() { + return isException; + } + + /** + * Checks if the provided exception signifies retry failure in ratis client. 
+ * In case of retry failure, ratis client throws RaftRetryFailureException + * and all succeeding operations are failed with AlreadyClosedException. + */ + protected boolean checkForRetryFailure(Throwable t) { + return t instanceof RaftRetryFailureException + || t instanceof AlreadyClosedException; + } + + // Every container specific exception from datatnode will be seen as + // StorageContainerException + protected boolean checkIfContainerToExclude(Throwable t) { + return t instanceof StorageContainerException; + } + + protected void setExceptionAndThrow(IOException ioe) throws IOException { + isException = true; + throw ioe; + } + + protected void handleRetry(IOException exception) throws IOException { + RetryPolicy retryPolicy = retryPolicyMap + .get(HddsClientUtils.checkForException(exception).getClass()); + if (retryPolicy == null) { + retryPolicy = retryPolicyMap.get(Exception.class); + } + handleRetry(exception, retryPolicy); + } + + protected void handleRetry(IOException exception, RetryPolicy retryPolicy) + throws IOException { + RetryPolicy.RetryAction action = null; + try { + action = retryPolicy.shouldRetry(exception, retryCount, 0, true); + } catch (Exception e) { + setExceptionAndThrow(new IOException(e)); + } + if (action != null && + action.action == RetryPolicy.RetryAction.RetryDecision.FAIL) { + String msg = ""; + if (action.reason != null) { + msg = "Retry request failed. " + action.reason; + } + setExceptionAndThrow(new IOException(msg, exception)); + } + + // Throw the exception if the thread is interrupted + if (Thread.currentThread().isInterrupted()) { + setExceptionAndThrow(exception); + } + Objects.requireNonNull(action); + Preconditions.checkArgument( + action.action == RetryPolicy.RetryAction.RetryDecision.RETRY); + if (action.delayMillis > 0) { + try { + Thread.sleep(action.delayMillis); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + IOException ioe = (IOException) new InterruptedIOException( + "Interrupted: action=" + action + ", retry policy=" + retryPolicy) + .initCause(e); + setExceptionAndThrow(ioe); + } + } + retryCount++; + } +} diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockDataStreamOutput.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockDataStreamOutput.java new file mode 100644 index 000000000000..611ce809d757 --- /dev/null +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockDataStreamOutput.java @@ -0,0 +1,737 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
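The AbstractDataStreamOutput above only supplies the retry bookkeeping; concrete streams are expected to drive it themselves. A minimal sketch of that pattern, assuming a hypothetical subclass and a hypothetical doWriteOnce() single attempt (neither name is part of this patch):

```java
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Map;

import org.apache.hadoop.hdds.scm.storage.AbstractDataStreamOutput;
import org.apache.hadoop.io.retry.RetryPolicy;

/** Hypothetical subclass, for illustration only. */
public class RetryingStreamOutputSketch extends AbstractDataStreamOutput {

  protected RetryingStreamOutputSketch(
      Map<Class<? extends Throwable>, RetryPolicy> retryPolicyMap) {
    super(retryPolicyMap);
  }

  @Override
  public void write(ByteBuffer b, int off, int len) throws IOException {
    while (true) {
      try {
        doWriteOnce(b, off, len); // hypothetical single write attempt
        resetRetryCount();        // success clears the retry counter
        return;
      } catch (IOException e) {
        if (checkForRetryFailure(e.getCause())) {
          setExceptionAndThrow(e); // ratis client already gave up retrying
        }
        handleRetry(e);            // sleeps per the policy, or rethrows on FAIL
      }
    }
  }

  private void doWriteOnce(ByteBuffer b, int off, int len) throws IOException {
    // placeholder for the actual datanode write
  }

  @Override
  public void flush() {
  }

  @Override
  public void close() {
  }
}
```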
+ */ + +package org.apache.hadoop.hdds.scm.storage; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import org.apache.hadoop.hdds.client.BlockID; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.BlockData; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ChunkInfo; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.KeyValue; +import org.apache.hadoop.hdds.ratis.ContainerCommandRequestMessage; +import org.apache.hadoop.hdds.ratis.RatisHelper; +import org.apache.hadoop.hdds.scm.OzoneClientConfig; +import org.apache.hadoop.hdds.scm.XceiverClientFactory; +import org.apache.hadoop.hdds.scm.XceiverClientManager; +import org.apache.hadoop.hdds.scm.XceiverClientMetrics; +import org.apache.hadoop.hdds.scm.XceiverClientRatis; +import org.apache.hadoop.hdds.scm.XceiverClientReply; +import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.hdds.scm.pipeline.Pipeline; +import org.apache.hadoop.ozone.common.Checksum; +import org.apache.hadoop.ozone.common.ChecksumData; +import org.apache.hadoop.ozone.common.OzoneChecksumException; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.ratis.client.api.DataStreamOutput; +import org.apache.ratis.io.StandardWriteOption; +import org.apache.ratis.protocol.DataStreamReply; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; + +import static org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.putBlockAsync; + +/** + * An {@link ByteBufferStreamOutput} used by the REST service in combination + * with the SCMClient to write the value of a key to a sequence + * of container chunks. Writes are buffered locally and periodically written to + * the container as a new chunk. In order to preserve the semantics that + * replacement of a pre-existing key is atomic, each instance of the stream has + * an internal unique identifier. This unique identifier and a monotonically + * increasing chunk index form a composite key that is used as the chunk name. + * After all data is written, a putKey call creates or updates the corresponding + * container key, and this call includes the full list of chunks that make up + * the key data. The list of chunks is updated all at once. Therefore, a + * concurrent reader never can see an intermediate state in which different + * chunks of data from different versions of the key data are interleaved. + * This class encapsulates all state management for buffering and writing + * through to the container. 
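The buffering described in this javadoc is governed by the OzoneClientConfig keys added at the top of this patch. A rough sketch of tuning them, assuming the config group keeps its usual "ozone.client" key prefix and the standard OzoneConfiguration/getObject APIs; the values are only illustrative:

```java
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.OzoneClientConfig;

public final class DataStreamConfigSketch {
  public static void main(String[] args) {
    OzoneConfiguration conf = new OzoneConfiguration();
    // Full key names assume the "ozone.client" config group prefix.
    conf.set("ozone.client.datastream.buffer.flush.size", "32MB"); // putBlock boundary
    conf.set("ozone.client.datastream.window.size", "128MB");      // retry buffer cap
    conf.setBoolean("ozone.client.datastream.pipeline.mode", true);

    OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class);
    // The setters added in this patch are the programmatic equivalent.
    clientConfig.setDataStreamMinPacketSize(1024 * 1024);
    clientConfig.setStreamWindowSize(128L * 1024 * 1024);
  }
}
```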
+ */ +public class BlockDataStreamOutput implements ByteBufferStreamOutput { + public static final Logger LOG = + LoggerFactory.getLogger(BlockDataStreamOutput.class); + + public static final int PUT_BLOCK_REQUEST_LENGTH_MAX = 1 << 20; // 1MB + + public static final String EXCEPTION_MSG = + "Unexpected Storage Container Exception: "; + private static final CompletableFuture[] EMPTY_FUTURE_ARRAY = {}; + + private AtomicReference blockID; + + private final BlockData.Builder containerBlockData; + private XceiverClientFactory xceiverClientFactory; + private XceiverClientRatis xceiverClient; + private OzoneClientConfig config; + + private int chunkIndex; + private final AtomicLong chunkOffset = new AtomicLong(); + + // Similar to 'BufferPool' but this list maintains only references + // to the ByteBuffers. + private List bufferList; + + // The IOException will be set by response handling thread in case there is an + // exception received in the response. If the exception is set, the next + // request will fail upfront. + private final AtomicReference ioException; + private final ExecutorService responseExecutor; + + // the effective length of data flushed so far + private long totalDataFlushedLength; + + // effective data write attempted so far for the block + private long writtenDataLength; + + // This object will maintain the commitIndexes and byteBufferList in order + // Also, corresponding to the logIndex, the corresponding list of buffers will + // be released from the buffer pool. + private final StreamCommitWatcher commitWatcher; + + private Queue> + putBlockFutures = new LinkedList<>(); + + private final List failedServers; + private final Checksum checksum; + + //number of buffers used before doing a flush/putBlock. + private int flushPeriod; + private final Token token; + private final DataStreamOutput out; + private CompletableFuture dataStreamCloseReply; + private List> futures = new ArrayList<>(); + private final long syncSize = 0; // TODO: disk sync is disabled for now + private long syncPosition = 0; + private StreamBuffer currentBuffer; + private XceiverClientMetrics metrics; + // buffers for which putBlock is yet to be executed + private List buffersForPutBlock; + private boolean isDatastreamPipelineMode; + /** + * Creates a new BlockDataStreamOutput. + * + * @param blockID block ID + * @param xceiverClientManager client manager that controls client + * @param pipeline pipeline where block will be written + */ + public BlockDataStreamOutput( + BlockID blockID, + XceiverClientFactory xceiverClientManager, + Pipeline pipeline, + OzoneClientConfig config, + Token token, + List bufferList + ) throws IOException { + this.xceiverClientFactory = xceiverClientManager; + this.config = config; + this.isDatastreamPipelineMode = config.isDatastreamPipelineMode(); + this.blockID = new AtomicReference<>(blockID); + KeyValue keyValue = + KeyValue.newBuilder().setKey("TYPE").setValue("KEY").build(); + this.containerBlockData = + BlockData.newBuilder().setBlockID(blockID.getDatanodeBlockIDProtobuf()) + .addMetadata(keyValue); + this.xceiverClient = + (XceiverClientRatis)xceiverClientManager.acquireClient(pipeline); + // Alternatively, stream setup can be delayed till the first chunk write. 
+ this.out = setupStream(pipeline); + this.token = token; + this.bufferList = bufferList; + flushPeriod = (int) (config.getStreamBufferFlushSize() / config + .getStreamBufferSize()); + + Preconditions + .checkArgument( + (long) flushPeriod * config.getStreamBufferSize() == config + .getStreamBufferFlushSize()); + + // A single thread executor handle the responses of async requests + responseExecutor = Executors.newSingleThreadExecutor(); + commitWatcher = new StreamCommitWatcher(xceiverClient, bufferList); + totalDataFlushedLength = 0; + writtenDataLength = 0; + failedServers = new ArrayList<>(0); + ioException = new AtomicReference<>(null); + checksum = new Checksum(config.getChecksumType(), + config.getBytesPerChecksum()); + metrics = XceiverClientManager.getXceiverClientMetrics(); + } + + private DataStreamOutput setupStream(Pipeline pipeline) throws IOException { + // Execute a dummy WriteChunk request to get the path of the target file, + // but does NOT write any data to it. + ContainerProtos.WriteChunkRequestProto.Builder writeChunkRequest = + ContainerProtos.WriteChunkRequestProto.newBuilder() + .setBlockID(blockID.get().getDatanodeBlockIDProtobuf()); + + String id = xceiverClient.getPipeline().getFirstNode().getUuidString(); + ContainerProtos.ContainerCommandRequestProto.Builder builder = + ContainerProtos.ContainerCommandRequestProto.newBuilder() + .setCmdType(ContainerProtos.Type.StreamInit) + .setContainerID(blockID.get().getContainerID()) + .setDatanodeUuid(id).setWriteChunk(writeChunkRequest); + + if (token != null) { + builder.setEncodedToken(token.encodeToUrlString()); + } + + ContainerCommandRequestMessage message = + ContainerCommandRequestMessage.toMessage(builder.build(), null); + + if (isDatastreamPipelineMode) { + return Preconditions.checkNotNull(xceiverClient.getDataStreamApi()) + .stream(message.getContent().asReadOnlyByteBuffer(), + RatisHelper.getRoutingTable(pipeline)); + } else { + return Preconditions.checkNotNull(xceiverClient.getDataStreamApi()) + .stream(message.getContent().asReadOnlyByteBuffer()); + } + } + + public BlockID getBlockID() { + return blockID.get(); + } + + public long getWrittenDataLength() { + return writtenDataLength; + } + + public List getFailedServers() { + return failedServers; + } + + @VisibleForTesting + public XceiverClientRatis getXceiverClient() { + return xceiverClient; + } + + public IOException getIoException() { + return ioException.get(); + } + + @Override + public void write(ByteBuffer b, int off, int len) throws IOException { + checkOpen(); + if (b == null) { + throw new NullPointerException(); + } + if (len == 0) { + return; + } + while (len > 0) { + allocateNewBufferIfNeeded(); + int writeLen = Math.min(len, currentBuffer.length()); + final StreamBuffer buf = new StreamBuffer(b, off, writeLen); + currentBuffer.put(buf); + writeChunkIfNeeded(); + off += writeLen; + writtenDataLength += writeLen; + len -= writeLen; + doFlushIfNeeded(); + } + } + + private void writeChunkIfNeeded() throws IOException { + if (currentBuffer.length() == 0) { + writeChunk(currentBuffer); + currentBuffer = null; + } + } + + private void writeChunk(StreamBuffer sb) throws IOException { + bufferList.add(sb); + if (buffersForPutBlock == null) { + buffersForPutBlock = new ArrayList<>(); + } + buffersForPutBlock.add(sb); + ByteBuffer dup = sb.duplicate(); + dup.position(0); + dup.limit(sb.position()); + writeChunkToContainer(dup); + } + + private void allocateNewBufferIfNeeded() { + if (currentBuffer == null) { + currentBuffer = + 
StreamBuffer.allocate(config.getDataStreamMinPacketSize()); + } + } + + private void doFlushIfNeeded() throws IOException { + long boundary = config.getDataStreamBufferFlushSize() / config + .getDataStreamMinPacketSize(); + // streamWindow is the maximum number of buffers that + // are allowed to exist in the bufferList. If buffers in + // the list exceed this limit, the client will wait till it gets + // one putBlock response (first index). This is similar to + // the bufferFull condition in the async write path. + long streamWindow = config.getStreamWindowSize() / config + .getDataStreamMinPacketSize(); + if (!bufferList.isEmpty() && bufferList.size() % boundary == 0 && + buffersForPutBlock != null && !buffersForPutBlock.isEmpty()) { + updateFlushLength(); + executePutBlock(false, false); + } + if (bufferList.size() == streamWindow) { + try { + checkOpen(); + if (!putBlockFutures.isEmpty()) { + putBlockFutures.remove().get(); + } + } catch (ExecutionException e) { + handleExecutionException(e); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + handleInterruptedException(ex, true); + } + watchForCommit(true); + } + } + + private void updateFlushLength() { + totalDataFlushedLength = writtenDataLength; + } + + @VisibleForTesting + public long getTotalDataFlushedLength() { + return totalDataFlushedLength; + } + /** + * Will be called on the retry path in case of ClosedContainerException/ + * TimeoutException. + * @param len length of data to write + * @throws IOException if an error occurred + */ + + public void writeOnRetry(long len) throws IOException { + if (len == 0) { + return; + } + if (LOG.isDebugEnabled()) { + LOG.debug("Retrying write length {} for blockID {}", len, blockID); + } + int count = 0; + while (len > 0) { + final StreamBuffer buf = bufferList.get(count); + final long writeLen = Math.min(buf.position(), len); + if (buffersForPutBlock == null) { + buffersForPutBlock = new ArrayList<>(); + } + buffersForPutBlock.add(buf); + final ByteBuffer duplicated = buf.duplicate(); + duplicated.position(0); + duplicated.limit(buf.position()); + writeChunkToContainer(duplicated); + len -= writeLen; + count++; + writtenDataLength += writeLen; + } + + + } + + /** + * Calls the watchForCommit API of the Ratis client. For the standalone client, + * it is a no-op. + * @param bufferFull flag indicating whether the bufferFull condition is hit or + * it is called as part of flush/close + * @return minimum commit index replicated to all nodes + * @throws IOException IOException in case watch gets timed out + */ + private void watchForCommit(boolean bufferFull) throws IOException { + checkOpen(); + try { + XceiverClientReply reply = bufferFull ? + commitWatcher.streamWatchOnFirstIndex() : + commitWatcher.streamWatchOnLastIndex(); + if (reply != null) { + List dnList = reply.getDatanodes(); + if (!dnList.isEmpty()) { + Pipeline pipe = xceiverClient.getPipeline(); + + LOG.warn("Failed to commit BlockId {} on {}.
Failed nodes: {}", + blockID, pipe, dnList); + failedServers.addAll(dnList); + } + } + } catch (IOException ioe) { + setIoException(ioe); + throw getIoException(); + } + + } + + /** + * @param close whether putBlock is happening as part of closing the stream + * @param force true if no data was written since most recent putBlock and + * stream is being closed + */ + private void executePutBlock(boolean close, + boolean force) throws IOException { + checkOpen(); + long flushPos = totalDataFlushedLength; + final List byteBufferList; + if (!force) { + Preconditions.checkNotNull(bufferList); + byteBufferList = buffersForPutBlock; + buffersForPutBlock = null; + Preconditions.checkNotNull(byteBufferList); + } else { + byteBufferList = null; + } + waitFuturesComplete(); + final BlockData blockData = containerBlockData.build(); + if (close) { + final ContainerCommandRequestProto putBlockRequest + = ContainerProtocolCalls.getPutBlockRequest( + xceiverClient.getPipeline(), blockData, true, token); + dataStreamCloseReply = executePutBlockClose(putBlockRequest, + PUT_BLOCK_REQUEST_LENGTH_MAX, out); + dataStreamCloseReply.whenComplete((reply, e) -> { + if (e != null || reply == null || !reply.isSuccess()) { + LOG.warn("Failed executePutBlockClose, reply=" + reply, e); + try { + executePutBlock(true, false); + } catch (IOException ex) { + throw new CompletionException(ex); + } + } + }); + } + + try { + XceiverClientReply asyncReply = + putBlockAsync(xceiverClient, blockData, close, token); + final CompletableFuture flushFuture + = asyncReply.getResponse().thenApplyAsync(e -> { + try { + validateResponse(e); + } catch (IOException sce) { + throw new CompletionException(sce); + } + // if the ioException is not set, putBlock is successful + if (getIoException() == null && !force) { + BlockID responseBlockID = BlockID.getFromProtobuf( + e.getPutBlock().getCommittedBlockLength().getBlockID()); + Preconditions.checkState(blockID.get().getContainerBlockID() + .equals(responseBlockID.getContainerBlockID())); + // updates the bcsId of the block + blockID.set(responseBlockID); + if (LOG.isDebugEnabled()) { + LOG.debug("Adding index " + asyncReply.getLogIndex() + + " commitMap size " + + commitWatcher.getCommitInfoMapSize() + " flushLength " + + flushPos + " blockID " + blockID); + } + // for standalone protocol, logIndex will always be 0. 
+ commitWatcher + .updateCommitInfoMap(asyncReply.getLogIndex(), + byteBufferList); + } + return e; + }, responseExecutor).exceptionally(e -> { + if (LOG.isDebugEnabled()) { + LOG.debug("putBlock failed for blockID {} with exception {}", + blockID, e.getLocalizedMessage()); + } + CompletionException ce = new CompletionException(e); + setIoException(ce); + throw ce; + }); + putBlockFutures.add(flushFuture); + } catch (IOException | ExecutionException e) { + throw new IOException(EXCEPTION_MSG + e.toString(), e); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + handleInterruptedException(ex, false); + } + } + + public static CompletableFuture executePutBlockClose( + ContainerCommandRequestProto putBlockRequest, int max, + DataStreamOutput out) { + final ByteBuffer putBlock = ContainerCommandRequestMessage.toMessage( + putBlockRequest, null).getContent().asReadOnlyByteBuffer(); + final ByteBuffer protoLength = getProtoLength(putBlock, max); + RatisHelper.debug(putBlock, "putBlock", LOG); + out.writeAsync(putBlock); + RatisHelper.debug(protoLength, "protoLength", LOG); + return out.writeAsync(protoLength, StandardWriteOption.CLOSE); + } + + public static ByteBuffer getProtoLength(ByteBuffer putBlock, int max) { + final int protoLength = putBlock.remaining(); + Preconditions.checkState(protoLength <= max, + "protoLength== %s > max = %s", protoLength, max); + final ByteBuffer buffer = ByteBuffer.allocate(4); + buffer.putInt(protoLength); + buffer.flip(); + LOG.debug("protoLength = {}", protoLength); + Preconditions.checkState(buffer.remaining() == 4); + return buffer.asReadOnlyBuffer(); + } + + @Override + public void flush() throws IOException { + if (xceiverClientFactory != null && xceiverClient != null + && !config.isStreamBufferFlushDelay()) { + waitFuturesComplete(); + } + } + + public void waitFuturesComplete() throws IOException { + try { + CompletableFuture.allOf(futures.toArray(EMPTY_FUTURE_ARRAY)).get(); + futures.clear(); + } catch (Exception e) { + LOG.warn("Failed to write all chunks through stream: " + e); + throw new IOException(e); + } + } + + /** + * @param close whether the flush is happening as part of closing the stream + */ + private void handleFlush(boolean close) + throws IOException, InterruptedException, ExecutionException { + checkOpen(); + // flush the last chunk data residing on the currentBuffer + if (totalDataFlushedLength < writtenDataLength) { + // This can be a partially filled chunk. Since we are flushing the buffer + // here, we just limit this buffer to the current position. 
So that next + // write will happen in new buffer + + if (currentBuffer != null) { + writeChunk(currentBuffer); + currentBuffer = null; + } + updateFlushLength(); + executePutBlock(close, false); + } else if (close) { + // forcing an "empty" putBlock if stream is being closed without new + // data since latest flush - we need to send the "EOF" flag + executePutBlock(true, true); + } + CompletableFuture.allOf(putBlockFutures.toArray(EMPTY_FUTURE_ARRAY)).get(); + watchForCommit(false); + // just check again if the exception is hit while waiting for the + // futures to ensure flush has indeed succeeded + + // irrespective of whether the commitIndex2flushedDataMap is empty + // or not, ensure there is no exception set + checkOpen(); + } + + @Override + public void close() throws IOException { + if (xceiverClientFactory != null && xceiverClient != null) { + try { + handleFlush(true); + dataStreamCloseReply.get(); + } catch (ExecutionException e) { + handleExecutionException(e); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + handleInterruptedException(ex, true); + } finally { + cleanup(false); + } + + } + } + + private void validateResponse( + ContainerProtos.ContainerCommandResponseProto responseProto) + throws IOException { + try { + // if the ioException is already set, it means a prev request has failed + // just throw the exception. The current operation will fail with the + // original error + IOException exception = getIoException(); + if (exception != null) { + throw exception; + } + ContainerProtocolCalls.validateContainerResponse(responseProto); + } catch (StorageContainerException sce) { + setIoException(sce); + throw sce; + } + } + + + private void setIoException(Throwable e) { + IOException ioe = getIoException(); + if (ioe == null) { + IOException exception = new IOException(EXCEPTION_MSG + e.toString(), e); + ioException.compareAndSet(null, exception); + } else { + LOG.debug("Previous request had already failed with " + ioe.toString() + + " so subsequent request also encounters" + + " Storage Container Exception ", e); + } + } + + public void cleanup(boolean invalidateClient) { + if (xceiverClientFactory != null) { + xceiverClientFactory.releaseClient(xceiverClient, invalidateClient); + } + xceiverClientFactory = null; + xceiverClient = null; + commitWatcher.cleanup(); + responseExecutor.shutdown(); + } + + /** + * Checks if the stream is open or exception has occurred. + * If not, throws an exception. + * + * @throws IOException if stream is closed + */ + private void checkOpen() throws IOException { + if (isClosed()) { + throw new IOException("BlockDataStreamOutput has been closed."); + } else if (getIoException() != null) { + throw getIoException(); + } + } + + public boolean isClosed() { + return xceiverClient == null; + } + + private boolean needSync(long position) { + if (syncSize > 0) { + // TODO: or position >= fileLength + if (position - syncPosition >= syncSize) { + syncPosition = position; + return true; + } + } + return false; + } + + /** + * Writes buffered data as a new chunk to the container and saves chunk + * information to be used later in putKey call. 
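For orientation, the doFlushIfNeeded() thresholds shown earlier reduce to simple integer arithmetic; a quick standalone sketch with this patch's default sizes hard-coded for illustration:

```java
public final class StreamFlushMathSketch {
  public static void main(String[] args) {
    long flushSize = 16L * 1024 * 1024;  // datastream.buffer.flush.size default
    int packetSize = 1024 * 1024;        // datastream.min.packet.size default
    long windowSize = 64L * 1024 * 1024; // datastream.window.size default

    long boundary = flushSize / packetSize;      // putBlock every 16 full buffers
    long streamWindow = windowSize / packetSize; // writer waits once 64 buffers are unacked

    System.out.println("putBlock boundary = " + boundary + " buffers");
    System.out.println("stream window     = " + streamWindow + " buffers");
  }
}
```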
+ * + * @param buf chunk data to write, from position to limit + * @throws IOException if there is an I/O error while performing the call + * @throws OzoneChecksumException if there is an error while computing + * checksum + */ + private void writeChunkToContainer(ByteBuffer buf) + throws IOException { + final int effectiveChunkSize = buf.remaining(); + final long offset = chunkOffset.getAndAdd(effectiveChunkSize); + ChecksumData checksumData = checksum.computeChecksum( + buf.asReadOnlyBuffer()); + ChunkInfo chunkInfo = ChunkInfo.newBuilder() + .setChunkName(blockID.get().getLocalID() + "_chunk_" + ++chunkIndex) + .setOffset(offset) + .setLen(effectiveChunkSize) + .setChecksumData(checksumData.getProtoBufMessage()) + .build(); + metrics.incrPendingContainerOpsMetrics(ContainerProtos.Type.WriteChunk); + + if (LOG.isDebugEnabled()) { + LOG.debug("Writing chunk {} length {} at offset {}", + chunkInfo.getChunkName(), effectiveChunkSize, offset); + } + + CompletableFuture future = + (needSync(offset + effectiveChunkSize) ? + out.writeAsync(buf, StandardWriteOption.SYNC) : + out.writeAsync(buf)) + .whenCompleteAsync((r, e) -> { + if (e != null || !r.isSuccess()) { + if (e == null) { + e = new IOException("result is not success"); + } + String msg = + "Failed to write chunk " + chunkInfo.getChunkName() + + " " + "into block " + blockID; + LOG.debug("{}, exception: {}", msg, e.getLocalizedMessage()); + CompletionException ce = new CompletionException(msg, e); + setIoException(ce); + throw ce; + } else if (r.isSuccess()) { + xceiverClient.updateCommitInfosMap(r.getCommitInfos()); + } + }, responseExecutor); + + futures.add(future); + containerBlockData.addChunks(chunkInfo); + } + + @VisibleForTesting + public void setXceiverClient(XceiverClientRatis xceiverClient) { + this.xceiverClient = xceiverClient; + } + + /** + * Handles InterruptedExecution. + * + * @param ex + * @param processExecutionException is optional, if passed as TRUE, then + * handle ExecutionException else skip it. + * @throws IOException + */ + private void handleInterruptedException(Exception ex, + boolean processExecutionException) + throws IOException { + LOG.error("Command execution was interrupted."); + if (processExecutionException) { + handleExecutionException(ex); + } else { + throw new IOException(EXCEPTION_MSG + ex.toString(), ex); + } + } + + /** + * Handles ExecutionException by adjusting buffers. + * @param ex + * @throws IOException + */ + private void handleExecutionException(Exception ex) throws IOException { + setIoException(ex); + throw getIoException(); + } + + public long getTotalAckDataLength() { + return commitWatcher.getTotalAckDataLength(); + } +} diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/ByteBufferStreamOutput.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/ByteBufferStreamOutput.java new file mode 100644 index 000000000000..0650a685b634 --- /dev/null +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/ByteBufferStreamOutput.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.storage; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.ByteBuffer; + +/** +* This interface is for writing an output stream of ByteBuffers. +* An ByteBufferStreamOutput accepts nio ByteBuffer and sends them to some sink. +*/ +public interface ByteBufferStreamOutput extends Closeable { + /** + * Try to write all the bytes in ByteBuf b to DataStream. + * + * @param b the data. + * @exception IOException if an I/O error occurs. + */ + default void write(ByteBuffer b) throws IOException { + write(b, b.position(), b.remaining()); + } + + /** + * Try to write the [off:off + len) slice in ByteBuf b to DataStream. + * + * @param b the data. + * @param off the start offset in the data. + * @param len the number of bytes to write. + * @exception IOException if an I/O error occurs. + */ + void write(ByteBuffer b, int off, int len) throws IOException; + + /** + * Flushes this DataStream output and forces any buffered output bytes + * to be written out. + * + * @exception IOException if an I/O error occurs. + */ + void flush() throws IOException; +} diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/StreamBuffer.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/StreamBuffer.java new file mode 100644 index 000000000000..d34e4dca9483 --- /dev/null +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/StreamBuffer.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.storage; + +import java.nio.ByteBuffer; + +/** + * Used for streaming write. 
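A caller-side sketch of the ByteBufferStreamOutput contract defined above; how the concrete sink (for example a BlockDataStreamOutput) is obtained is out of scope here, and the data is illustrative:

```java
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hdds.scm.storage.ByteBufferStreamOutput;

public final class ByteBufferStreamOutputUsageSketch {
  static void copyTo(ByteBufferStreamOutput sink) throws IOException {
    ByteBuffer data =
        ByteBuffer.wrap("hello stream".getBytes(StandardCharsets.UTF_8));
    try {
      // The default write(ByteBuffer) delegates to write(b, b.position(), b.remaining()).
      sink.write(data);
      sink.flush();
    } finally {
      sink.close(); // Closeable: releases the underlying ratis stream
    }
  }
}
```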
+ */ +public class StreamBuffer { + private final ByteBuffer buffer; + + public StreamBuffer(ByteBuffer buffer) { + this.buffer = buffer; + } + + public StreamBuffer(ByteBuffer buffer, int offset, int length) { + this((ByteBuffer) buffer.asReadOnlyBuffer().position(offset) + .limit(offset + length)); + } + + public ByteBuffer duplicate() { + return buffer.duplicate(); + } + + public int length() { + return buffer.limit() - buffer.position(); + } + + public int position() { + return buffer.position(); + } + + + public void put(StreamBuffer sb) { + buffer.put(sb.buffer); + } + + public static StreamBuffer allocate(int size) { + return new StreamBuffer(ByteBuffer.allocate(size)); + } + +} \ No newline at end of file diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/StreamCommitWatcher.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/StreamCommitWatcher.java new file mode 100644 index 000000000000..8ca70de81684 --- /dev/null +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/StreamCommitWatcher.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.storage; + +import com.google.common.base.Preconditions; +import org.apache.hadoop.hdds.scm.XceiverClientReply; +import org.apache.hadoop.hdds.scm.XceiverClientSpi; +import org.apache.ratis.util.JavaUtils; +import org.apache.ratis.util.MemoizedSupplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import java.util.stream.Collectors; + +/** + * This class executes watchForCommit on ratis pipeline and releases + * buffers once data successfully gets replicated. 
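A small standalone demonstration of the StreamBuffer wrapper above: the (buffer, offset, length) constructor captures a read-only slice, put() copies it into an allocated buffer, and position()/length() report the bytes written and the space left (sizes are illustrative):

```java
import java.nio.ByteBuffer;

import org.apache.hadoop.hdds.scm.storage.StreamBuffer;

public final class StreamBufferSketch {
  public static void main(String[] args) {
    // A 1 MB buffer, as allocated by BlockDataStreamOutput.allocateNewBufferIfNeeded().
    StreamBuffer current = StreamBuffer.allocate(1024 * 1024);

    // Wrap a read-only 16-byte slice [8, 24) of some caller data, then copy it in.
    ByteBuffer data = ByteBuffer.allocate(64);
    StreamBuffer slice = new StreamBuffer(data, 8, 16);
    current.put(slice);

    System.out.println(current.position()); // 16 bytes written so far
    System.out.println(current.length());   // 1048576 - 16 bytes still free
  }
}
```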
+ */ +public class StreamCommitWatcher { + + private static final Logger LOG = + LoggerFactory.getLogger(StreamCommitWatcher.class); + + private Map> commitIndexMap; + private final List bufferList; + + // total data which has been successfully flushed and acknowledged + // by all servers + private long totalAckDataLength; + private final ConcurrentMap> + replies = new ConcurrentHashMap<>(); + + private final XceiverClientSpi xceiverClient; + + public StreamCommitWatcher(XceiverClientSpi xceiverClient, + List bufferList) { + this.xceiverClient = xceiverClient; + commitIndexMap = new ConcurrentSkipListMap<>(); + this.bufferList = bufferList; + totalAckDataLength = 0; + } + + public void updateCommitInfoMap(long index, List buffers) { + commitIndexMap.computeIfAbsent(index, k -> new LinkedList<>()) + .addAll(buffers); + } + + int getCommitInfoMapSize() { + return commitIndexMap.size(); + } + + /** + * Calls watch for commit for the first index in commitIndex2flushedDataMap to + * the Ratis client. + * @return {@link XceiverClientReply} reply from raft client + * @throws IOException in case watchForCommit fails + */ + public XceiverClientReply streamWatchOnFirstIndex() throws IOException { + if (!commitIndexMap.isEmpty()) { + // wait for the first commit index in the commitIndex2flushedDataMap + // to get committed to all or majority of nodes in case timeout + // happens. + long index = + commitIndexMap.keySet().stream().mapToLong(v -> v).min() + .getAsLong(); + if (LOG.isDebugEnabled()) { + LOG.debug("waiting for first index {} to catch up", index); + } + return streamWatchForCommit(index); + } else { + return null; + } + } + + /** + * Calls watch for commit for the last index in commitIndex2flushedDataMap to + * the Ratis client. + * @return {@link XceiverClientReply} reply from raft client + * @throws IOException in case watchForCommit fails + */ + public XceiverClientReply streamWatchOnLastIndex() + throws IOException { + if (!commitIndexMap.isEmpty()) { + // wait for the commit index in the commitIndex2flushedDataMap + // to get committed to all or majority of nodes in case timeout + // happens. + long index = + commitIndexMap.keySet().stream().mapToLong(v -> v).max() + .getAsLong(); + if (LOG.isDebugEnabled()) { + LOG.debug("waiting for last flush Index {} to catch up", index); + } + return streamWatchForCommit(index); + } else { + return null; + } + } + + /** + * calls watchForCommit API of the Ratis Client. This method is for streaming + * and no longer requires releaseBuffers + * @param commitIndex log index to watch for + * @return minimum commit index replicated to all nodes + * @throws IOException IOException in case watch gets timed out + */ + public XceiverClientReply streamWatchForCommit(long commitIndex) + throws IOException { + final MemoizedSupplier> supplier + = JavaUtils.memoize(CompletableFuture::new); + final CompletableFuture f = replies.compute(commitIndex, + (key, value) -> value != null ? 
value : supplier.get()); + if (!supplier.isInitialized()) { + // future already exists + return f.join(); + } + + try { + XceiverClientReply reply = + xceiverClient.watchForCommit(commitIndex); + f.complete(reply); + final CompletableFuture removed + = replies.remove(commitIndex); + Preconditions.checkState(removed == f); + + adjustBuffers(reply.getLogIndex()); + return reply; + } catch (InterruptedException e) { + // Re-interrupt the thread while catching InterruptedException + Thread.currentThread().interrupt(); + throw getIOExceptionForWatchForCommit(commitIndex, e); + } catch (TimeoutException | ExecutionException e) { + throw getIOExceptionForWatchForCommit(commitIndex, e); + } + } + + void releaseBuffersOnException() { + adjustBuffers(xceiverClient.getReplicatedMinCommitIndex()); + } + + private void adjustBuffers(long commitIndex) { + List keyList = commitIndexMap.keySet().stream() + .filter(p -> p <= commitIndex).collect(Collectors.toList()); + if (!keyList.isEmpty()) { + releaseBuffers(keyList); + } + } + + private long releaseBuffers(List indexes) { + Preconditions.checkArgument(!commitIndexMap.isEmpty()); + for (long index : indexes) { + Preconditions.checkState(commitIndexMap.containsKey(index)); + final List buffers = commitIndexMap.remove(index); + final long length = + buffers.stream().mapToLong(StreamBuffer::position).sum(); + totalAckDataLength += length; + for (StreamBuffer byteBuffer : buffers) { + bufferList.remove(byteBuffer); + } + } + return totalAckDataLength; + } + + public long getTotalAckDataLength() { + return totalAckDataLength; + } + + private IOException getIOExceptionForWatchForCommit(long commitIndex, + Exception e) { + LOG.warn("watchForCommit failed for index {}", commitIndex, e); + IOException ioException = new IOException( + "Unexpected Storage Container Exception: " + e.toString(), e); + releaseBuffersOnException(); + return ioException; + } + + public void cleanup() { + if (commitIndexMap != null) { + commitIndexMap.clear(); + } + commitIndexMap = null; + } +} diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/protocol/DatanodeDetails.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/protocol/DatanodeDetails.java index 25826f3e23d7..78a0eeb7c5ee 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/protocol/DatanodeDetails.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/protocol/DatanodeDetails.java @@ -273,8 +273,10 @@ public synchronized Port getPort(Port.Name name) { return port; } } - // if no separate admin/server port, return single Ratis one for compat - if (name == Name.RATIS_ADMIN || name == Name.RATIS_SERVER) { + // if no separate admin/server/datastream port, return single Ratis one for + // compat + if (name == Name.RATIS_ADMIN || name == Name.RATIS_SERVER || + name == Name.RATIS_DATASTREAM) { return getPort(Name.RATIS); } return null; @@ -784,7 +786,8 @@ public static final class Port { * Ports that are supported in DataNode. 
*/ public enum Name { - STANDALONE, RATIS, REST, REPLICATION, RATIS_ADMIN, RATIS_SERVER; + STANDALONE, RATIS, REST, REPLICATION, RATIS_ADMIN, RATIS_SERVER, + RATIS_DATASTREAM; public static final Set ALL_PORTS = ImmutableSet.copyOf( Name.values()); diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java index 67a3ac14a4c1..4f9844011bc8 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hdds.ratis; import java.io.IOException; +import java.nio.ByteBuffer; import java.security.cert.X509Certificate; import java.util.ArrayList; import java.util.Collection; @@ -44,17 +45,21 @@ import org.apache.ratis.client.RaftClientConfigKeys; import org.apache.ratis.conf.Parameters; import org.apache.ratis.conf.RaftProperties; +import org.apache.ratis.datastream.SupportedDataStreamType; import org.apache.ratis.grpc.GrpcConfigKeys; import org.apache.ratis.grpc.GrpcTlsConfig; +import org.apache.ratis.netty.NettyConfigKeys; import org.apache.ratis.proto.RaftProtos; import org.apache.ratis.protocol.RaftGroup; import org.apache.ratis.protocol.RaftGroupId; import org.apache.ratis.protocol.RaftPeer; import org.apache.ratis.protocol.RaftPeerId; +import org.apache.ratis.protocol.RoutingTable; import org.apache.ratis.retry.RetryPolicy; import org.apache.ratis.rpc.RpcType; import org.apache.ratis.rpc.SupportedRpcType; import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; +import org.apache.ratis.thirdparty.io.netty.buffer.ByteBuf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -119,7 +124,9 @@ private static RaftPeer.Builder raftPeerBuilderFor(DatanodeDetails dn) { .setId(toRaftPeerId(dn)) .setAddress(toRaftPeerAddress(dn, Port.Name.RATIS_SERVER)) .setAdminAddress(toRaftPeerAddress(dn, Port.Name.RATIS_ADMIN)) - .setClientAddress(toRaftPeerAddress(dn, Port.Name.RATIS)); + .setClientAddress(toRaftPeerAddress(dn, Port.Name.RATIS)) + .setDataStreamAddress( + toRaftPeerAddress(dn, Port.Name.RATIS_DATASTREAM)); } private static List toRaftPeers(Pipeline pipeline) { @@ -173,6 +180,7 @@ public static RaftClient newRaftClient(RpcType rpcType, Pipeline pipeline, ConfigurationSource ozoneConfiguration) throws IOException { return newRaftClient(rpcType, toRaftPeerId(pipeline.getLeaderNode()), + toRaftPeer(pipeline.getFirstNode()), newRaftGroup(RaftGroupId.valueOf(pipeline.getId().getId()), pipeline.getNodes()), retryPolicy, tlsConfig, ozoneConfiguration); } @@ -192,7 +200,7 @@ public static BiFunction newRaftClient( public static RaftClient newRaftClient(RpcType rpcType, RaftPeer leader, RetryPolicy retryPolicy, GrpcTlsConfig tlsConfig, ConfigurationSource configuration) { - return newRaftClient(rpcType, leader.getId(), + return newRaftClient(rpcType, leader.getId(), leader, newRaftGroup(Collections.singletonList(leader)), retryPolicy, tlsConfig, configuration); } @@ -200,14 +208,14 @@ public static RaftClient newRaftClient(RpcType rpcType, RaftPeer leader, public static RaftClient newRaftClient(RpcType rpcType, RaftPeer leader, RetryPolicy retryPolicy, ConfigurationSource ozoneConfiguration) { - return newRaftClient(rpcType, leader.getId(), + return newRaftClient(rpcType, leader.getId(), leader, newRaftGroup(Collections.singletonList(leader)), retryPolicy, null, ozoneConfiguration); } 
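Relating the two streaming topologies to the code: in pipeline mode the getRoutingTable helper added further down in RatisHelper chains the primary datanode to the remaining peers (dn1 -> dn2 -> dn3), while star mode simply omits the routing table so the primary streams to each follower directly. A sketch with made-up peer ids:

```java
import org.apache.ratis.protocol.RaftPeerId;
import org.apache.ratis.protocol.RoutingTable;

public final class PipelineRoutingSketch {
  public static void main(String[] args) {
    RaftPeerId dn1 = RaftPeerId.valueOf("dn1"); // primary (pipeline.getFirstNode())
    RaftPeerId dn2 = RaftPeerId.valueOf("dn2");
    RaftPeerId dn3 = RaftPeerId.valueOf("dn3");

    // Pipeline mode: dn1 streams to dn2, dn2 relays to dn3.
    RoutingTable.Builder builder = RoutingTable.newBuilder();
    builder.addSuccessor(dn1, dn2);
    builder.addSuccessor(dn2, dn3);
    RoutingTable pipelineMode = builder.build();

    // Star mode (datastream.pipeline.mode = false) passes no routing table at all,
    // so dn1 streams to dn2 and dn3 directly.
    System.out.println(pipelineMode);
  }
}
```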
@SuppressWarnings("checkstyle:ParameterNumber") private static RaftClient newRaftClient(RpcType rpcType, RaftPeerId leader, - RaftGroup group, RetryPolicy retryPolicy, + RaftPeer primary, RaftGroup group, RetryPolicy retryPolicy, GrpcTlsConfig tlsConfig, ConfigurationSource ozoneConfiguration) { if (LOG.isTraceEnabled()) { LOG.trace("newRaftClient: {}, leader={}, group={}", @@ -221,6 +229,7 @@ private static RaftClient newRaftClient(RpcType rpcType, RaftPeerId leader, return RaftClient.newBuilder() .setRaftGroup(group) .setLeaderId(leader) + .setPrimaryDataStreamServer(primary) .setProperties(properties) .setParameters(setClientTlsConf(rpcType, tlsConfig)) .setRetryPolicy(retryPolicy) @@ -250,6 +259,7 @@ private static void setClientTlsConf(Parameters parameters, GrpcTlsConfig tlsConfig) { if (tlsConfig != null) { GrpcConfigKeys.Client.setTlsConf(parameters, tlsConfig); + NettyConfigKeys.DataStream.Client.setTlsConf(parameters, tlsConfig); } } @@ -260,6 +270,8 @@ public static Parameters setServerTlsConf( GrpcConfigKeys.Server.setTlsConf(parameters, serverConf); GrpcConfigKeys.TLS.setConf(parameters, serverConf); setAdminTlsConf(parameters, serverConf); + + NettyConfigKeys.DataStream.Server.setTlsConf(parameters, serverConf); } setClientTlsConf(parameters, clientConf); return parameters; @@ -278,6 +290,8 @@ public static RaftProperties newRaftProperties(RpcType rpcType) { public static RaftProperties setRpcType(RaftProperties properties, RpcType rpcType) { RaftConfigKeys.Rpc.setType(properties, rpcType); + RaftConfigKeys.DataStream.setType(properties, + SupportedDataStreamType.NETTY); return properties; } @@ -295,7 +309,8 @@ public static void createRaftClientProperties(ConfigurationSource ozoneConf, Map ratisClientConf = getDatanodeRatisPrefixProps(ozoneConf); ratisClientConf.forEach((key, val) -> { - if (isClientConfig(key) || isGrpcClientConfig(key)) { + if (isClientConfig(key) || isGrpcClientConfig(key) + || isNettyStreamConfig(key)) { raftProperties.set(key, val); } }); @@ -311,6 +326,15 @@ private static boolean isGrpcClientConfig(String key) { !key.startsWith(GrpcConfigKeys.Admin.PREFIX) && !key.startsWith(GrpcConfigKeys.Server.PREFIX); } + + private static boolean isNettyStreamConfig(String key) { + return key.startsWith(NettyConfigKeys.DataStream.PREFIX); + } + + private static boolean isStreamClientConfig(String key) { + return key.startsWith(RaftClientConfigKeys.DataStream.PREFIX); + } + /** * Set all server properties matching with prefix * {@link RatisHelper#HDDS_DATANODE_RATIS_PREFIX_KEY} in @@ -325,7 +349,8 @@ public static void createRaftServerProperties(ConfigurationSource ozoneConf, getDatanodeRatisPrefixProps(ozoneConf); ratisServerConf.forEach((key, val) -> { // Exclude ratis client configuration. 
- if (!isClientConfig(key)) { + if (isNettyStreamConfig(key) || isStreamClientConfig(key) || + !isClientConfig(key)) { raftProperties.set(key, val); } }); @@ -369,6 +394,37 @@ public static Long getMinReplicatedIndex( .min(Long::compareTo).orElse(null); } + public static RoutingTable getRoutingTable(Pipeline pipeline) { + RaftPeerId primaryId = null; + List raftPeers = new ArrayList<>(); + + for (DatanodeDetails dn : pipeline.getNodes()) { + final RaftPeerId raftPeerId = RaftPeerId.valueOf(dn.getUuidString()); + try { + if (dn == pipeline.getFirstNode()) { + primaryId = raftPeerId; + } + } catch (IOException e) { + LOG.error("Can not get FirstNode from the pipeline: {} with " + + "exception: {}", pipeline.toString(), e.getLocalizedMessage()); + return null; + } + raftPeers.add(raftPeerId); + } + + RoutingTable.Builder builder = RoutingTable.newBuilder(); + RaftPeerId previousId = primaryId; + for (RaftPeerId peerId : raftPeers) { + if (peerId.equals(primaryId)) { + continue; + } + builder.addSuccessor(previousId, peerId); + previousId = peerId; + } + + return builder.build(); + } + private static Class getClass(String name, Class xface) { try { @@ -382,4 +438,28 @@ private static Class getClass(String name, throw new RuntimeException(e); } } + + public static void debug(ByteBuffer buffer, String name, Logger log) { + if (!log.isDebugEnabled()) { + return; + } + buffer = buffer.duplicate(); + final StringBuilder builder = new StringBuilder(); + for (int i = 1; buffer.remaining() > 0; i++) { + builder.append(buffer.get()).append(i % 20 == 0 ? "\n " : ", "); + } + log.debug("{}: {}\n {}", name, buffer, builder); + } + + public static void debug(ByteBuf buf, String name, Logger log) { + if (!log.isDebugEnabled()) { + return; + } + buf = buf.duplicate(); + final StringBuilder builder = new StringBuilder(); + for (int i = 1; buf.readableBytes() > 0; i++) { + builder.append(buf.readByte()).append(i % 20 == 0 ? 
"\n " : ", "); + } + log.debug("{}: {}\n {}", name, buf, builder); + } } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/storage/ContainerProtocolCalls.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/storage/ContainerProtocolCalls.java index b5365820e3d8..25d06fd18a67 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/storage/ContainerProtocolCalls.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/storage/ContainerProtocolCalls.java @@ -56,6 +56,7 @@ import org.apache.hadoop.hdds.scm.container.common.helpers.BlockNotCommittedException; import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerNotOpenException; import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.hdds.scm.pipeline.Pipeline; import org.apache.hadoop.hdds.security.token.OzoneBlockTokenIdentifier; import org.apache.hadoop.ozone.common.Checksum; import org.apache.hadoop.ozone.common.ChecksumData; @@ -233,11 +234,19 @@ public static XceiverClientReply putBlockAsync( XceiverClientSpi xceiverClient, BlockData containerBlockData, boolean eof, Token token) throws IOException, InterruptedException, ExecutionException { + final ContainerCommandRequestProto request = getPutBlockRequest( + xceiverClient.getPipeline(), containerBlockData, eof, token); + return xceiverClient.sendCommandAsync(request); + } + + public static ContainerCommandRequestProto getPutBlockRequest( + Pipeline pipeline, BlockData containerBlockData, boolean eof, + Token token) throws IOException { PutBlockRequestProto.Builder createBlockRequest = PutBlockRequestProto.newBuilder() .setBlockData(containerBlockData) .setEof(eof); - String id = xceiverClient.getPipeline().getFirstNode().getUuidString(); + final String id = pipeline.getFirstNode().getUuidString(); ContainerCommandRequestProto.Builder builder = ContainerCommandRequestProto.newBuilder().setCmdType(Type.PutBlock) .setContainerID(containerBlockData.getBlockID().getContainerID()) @@ -246,8 +255,7 @@ public static XceiverClientReply putBlockAsync( if (token != null) { builder.setEncodedToken(token.encodeToUrlString()); } - ContainerCommandRequestProto request = builder.build(); - return xceiverClient.sendCommandAsync(request); + return builder.build(); } /** diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java index 1a47ad9fd8f0..37e28a154a7a 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java @@ -57,6 +57,12 @@ public final class OzoneConfigKeys { public static final boolean DFS_CONTAINER_IPC_RANDOM_PORT_DEFAULT = false; + public static final String DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT = + "dfs.container.ratis.datastream.random.port"; + public static final boolean + DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT_DEFAULT = + false; + public static final String DFS_CONTAINER_CHUNK_WRITE_SYNC_KEY = "dfs.container.chunk.write.sync"; public static final boolean DFS_CONTAINER_CHUNK_WRITE_SYNC_DEFAULT = false; @@ -79,6 +85,25 @@ public final class OzoneConfigKeys { "dfs.container.ratis.server.port"; public static final int DFS_CONTAINER_RATIS_SERVER_PORT_DEFAULT = 9856; + /** + * Ratis Port where containers listen to datastream requests. 
+ */ + public static final String DFS_CONTAINER_RATIS_DATASTREAM_ENABLE + = "dfs.container.ratis.datastream.enable"; + public static final boolean DFS_CONTAINER_RATIS_DATASTREAM_ENABLE_DEFAULT + = false; + public static final String DFS_CONTAINER_RATIS_DATASTREAM_PORT + = "dfs.container.ratis.datastream.port"; + public static final int DFS_CONTAINER_RATIS_DATASTREAM_PORT_DEFAULT + = 9855; + + /** + * Flag to enable ratis streaming on filesystem writes. + */ + public static final String OZONE_FS_DATASTREAM_ENABLE = + "ozone.fs.datastream.enable"; + public static final boolean OZONE_FS_DATASTREAM_ENABLE_DEFAULT = false; + /** * When set to true, allocate a random free port for ozone container, so that * a mini cluster is able to launch multiple containers on a node. diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/audit/DNAction.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/audit/DNAction.java index 1c87f2bdebad..73aff9ac830c 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/audit/DNAction.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/audit/DNAction.java @@ -38,7 +38,8 @@ public enum DNAction implements AuditAction { PUT_SMALL_FILE, GET_SMALL_FILE, CLOSE_CONTAINER, - GET_COMMITTED_BLOCK_LENGTH; + GET_COMMITTED_BLOCK_LENGTH, + STREAM_INIT; @Override public String getAction() { diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/Checksum.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/Checksum.java index 76f84c46ab5e..d300b9ef0e50 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/Checksum.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/Checksum.java @@ -139,6 +139,11 @@ public ChecksumData computeChecksum(byte[] data) */ public ChecksumData computeChecksum(ByteBuffer data) throws OzoneChecksumException { + // If type is set to NONE, we do not need to compute the checksums. We also + // need to avoid unnecessary conversions. + if (checksumType == ChecksumType.NONE) { + return new ChecksumData(checksumType, bytesPerChecksum); + } if (!data.isReadOnly()) { data = data.asReadOnlyBuffer(); } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerCommandRequestPBHelper.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerCommandRequestPBHelper.java index a13f164eec62..4d7f0f37c4eb 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerCommandRequestPBHelper.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerCommandRequestPBHelper.java @@ -187,6 +187,7 @@ public static DNAction getAuditAction(Type cmdType) { case GetSmallFile : return DNAction.GET_SMALL_FILE; case CloseContainer : return DNAction.CLOSE_CONTAINER; case GetCommittedBlockLength : return DNAction.GET_COMMITTED_BLOCK_LENGTH; + case StreamInit : return DNAction.STREAM_INIT; default : LOG.debug("Invalid command type - {}", cmdType); return null; diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index 21c580b8cd69..a3f631b5133c 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -53,6 +53,26 @@ OZONE, CONTAINER, MANAGEMENT The ipc port number of container. 
+ + dfs.container.ratis.datastream.enable + false + OZONE, CONTAINER, RATIS, DATASTREAM + If enable datastream ipc of container. + + + dfs.container.ratis.datastream.port + 9855 + OZONE, CONTAINER, RATIS, DATASTREAM + The datastream port number of container. + + + dfs.container.ratis.datastream.random.port + false + OZONE, CONTAINER, RATIS, DATASTREAM + Allocates a random free port for ozone container datastream. + This is used only while running unit tests. + + dfs.container.ipc.random.port false @@ -3252,4 +3272,12 @@ If the timeout has been reached, a warning message will be logged. + + + ozone.fs.datastream.enable + false + OZONE, DATANODE + To enable/disable filesystem write via ratis streaming. + + diff --git a/hadoop-hdds/common/src/test/java/org/apache/hadoop/ozone/container/ContainerTestHelper.java b/hadoop-hdds/common/src/test/java/org/apache/hadoop/ozone/container/ContainerTestHelper.java index db8943f2a810..3ad59684828f 100644 --- a/hadoop-hdds/common/src/test/java/org/apache/hadoop/ozone/container/ContainerTestHelper.java +++ b/hadoop-hdds/common/src/test/java/org/apache/hadoop/ozone/container/ContainerTestHelper.java @@ -599,6 +599,18 @@ public static String getFixedLengthString(String string, int length) { return String.format("%1$" + length + "s", string); } + public static byte[] generateData(int length, boolean random) { + final byte[] data = new byte[length]; + if (random) { + ThreadLocalRandom.current().nextBytes(data); + } else { + for (int i = 0; i < length; i++) { + data[i] = (byte) i; + } + } + return data; + } + /** * Construct fake protobuf messages for various types of requests. * This is tedious, however necessary to test. Protobuf classes are final diff --git a/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/ConfigTag.java b/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/ConfigTag.java index 8cf584d75f61..3728a0b1f590 100644 --- a/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/ConfigTag.java +++ b/hadoop-hdds/config/src/main/java/org/apache/hadoop/hdds/conf/ConfigTag.java @@ -46,5 +46,6 @@ public enum ConfigTag { DELETION, HA, BALANCER, - UPGRADE + UPGRADE, + DATASTREAM } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java index 802104a17140..8ef882bdd487 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java @@ -66,6 +66,7 @@ import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.malformedRequest; import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.unsupportedRequest; +import org.apache.ratis.statemachine.StateMachine; import org.apache.ratis.thirdparty.com.google.protobuf.ProtocolMessageEnum; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -199,7 +200,8 @@ private ContainerCommandResponseProto dispatchRequest( boolean isWriteStage = (cmdType == Type.WriteChunk && dispatcherContext != null && dispatcherContext.getStage() - == DispatcherContext.WriteChunkStage.WRITE_DATA); + == DispatcherContext.WriteChunkStage.WRITE_DATA) + || (cmdType == Type.StreamInit); boolean isWriteCommitStage = (cmdType == Type.WriteChunk && dispatcherContext != null && dispatcherContext.getStage() @@ -677,4 
+679,21 @@ private boolean isAllowed(String action) { default: return false; } } + + @Override + public StateMachine.DataChannel getStreamDataChannel( + ContainerCommandRequestProto msg) + throws StorageContainerException { + long containerID = msg.getContainerID(); + Container container = getContainer(containerID); + if (container != null) { + Handler handler = getHandler(getContainerType(container)); + return handler.getStreamDataChannel(container, msg); + } else { + throw new StorageContainerException( + "ContainerID " + containerID + " does not exist", + ContainerProtos.Result.CONTAINER_NOT_FOUND); + } + } + } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/ContainerDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/ContainerDispatcher.java index a2e397d54615..d02bae0a35ad 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/ContainerDispatcher.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/ContainerDispatcher.java @@ -25,6 +25,7 @@ .ContainerCommandResponseProto; import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext; +import org.apache.ratis.statemachine.StateMachine; import java.util.Map; @@ -84,4 +85,13 @@ void validateContainerCommand( * @param clusterId */ void setClusterId(String clusterId); + + /** + * When uploading using stream, get StreamDataChannel. + */ + default StateMachine.DataChannel getStreamDataChannel( + ContainerCommandRequestProto msg) throws StorageContainerException { + throw new UnsupportedOperationException( + "getStreamDataChannel not supported."); + } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java index 67f977ca387b..a62de490b410 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java @@ -37,6 +37,7 @@ import org.apache.hadoop.ozone.container.common.volume.VolumeSet; import org.apache.hadoop.ozone.container.keyvalue.KeyValueHandler; import org.apache.hadoop.ozone.container.keyvalue.TarContainerPacker; +import org.apache.ratis.statemachine.StateMachine; /** * Dispatcher sends ContainerCommandRequests to Handler. Each Container Type @@ -81,6 +82,10 @@ public static Handler getHandlerForContainerType( } } + public abstract StateMachine.DataChannel getStreamDataChannel( + Container container, ContainerCommandRequestProto msg) + throws StorageContainerException; + /** * Returns the Id of this datanode. 
* diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index 02c0a8d2b152..f6f5a99927ca 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -29,6 +29,7 @@ import java.util.Map; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.CompletionException; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -61,6 +62,7 @@ import org.apache.hadoop.ozone.common.utils.BufferUtils; import org.apache.hadoop.ozone.container.common.interfaces.ContainerDispatcher; import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration; +import org.apache.hadoop.ozone.container.keyvalue.impl.KeyValueStreamDataChannel; import org.apache.hadoop.ozone.container.ozoneimpl.ContainerController; import org.apache.hadoop.util.Time; @@ -80,6 +82,7 @@ import org.apache.ratis.server.protocol.TermIndex; import org.apache.ratis.server.raftlog.RaftLog; import org.apache.ratis.server.storage.RaftStorage; +import org.apache.ratis.statemachine.StateMachine; import org.apache.ratis.statemachine.StateMachineStorage; import org.apache.ratis.statemachine.TransactionContext; import org.apache.ratis.statemachine.impl.BaseStateMachine; @@ -90,6 +93,7 @@ import org.apache.ratis.thirdparty.com.google.protobuf.TextFormat; import org.apache.ratis.util.TaskQueue; import org.apache.ratis.util.function.CheckedSupplier; +import org.apache.ratis.util.JavaUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -423,6 +427,20 @@ private ContainerCommandResponseProto runCommand( return dispatchCommand(requestProto, context); } + private CompletableFuture runCommandAsync( + ContainerCommandRequestProto requestProto, LogEntryProto entry) { + return CompletableFuture.supplyAsync(() -> { + final DispatcherContext context = new DispatcherContext.Builder() + .setTerm(entry.getTerm()) + .setLogIndex(entry.getIndex()) + .setStage(DispatcherContext.WriteChunkStage.COMMIT_DATA) + .setContainer2BCSIDMap(container2BCSIDMap) + .build(); + + return runCommand(requestProto, context); + }, executor); + } + private CompletableFuture handleWriteChunk( ContainerCommandRequestProto requestProto, long entryIndex, long term, long startTime) { @@ -510,6 +528,64 @@ private CompletableFuture handleWriteChunk( return raftFuture; } + private StateMachine.DataChannel getStreamDataChannel( + ContainerCommandRequestProto requestProto, + DispatcherContext context) throws StorageContainerException { + if (LOG.isDebugEnabled()) { + LOG.debug("{}: getStreamDataChannel {} containerID={} pipelineID={} " + + "traceID={}", gid, requestProto.getCmdType(), + requestProto.getContainerID(), requestProto.getPipelineID(), + requestProto.getTraceID()); + } + runCommand(requestProto, context); // stream init + return dispatcher.getStreamDataChannel(requestProto); + } + + @Override + public CompletableFuture stream(RaftClientRequest request) { + return CompletableFuture.supplyAsync(() -> { + try { + ContainerCommandRequestProto requestProto = + 
message2ContainerCommandRequestProto(request.getMessage()); + DispatcherContext context = + new DispatcherContext.Builder() + .setStage(DispatcherContext.WriteChunkStage.WRITE_DATA) + .setContainer2BCSIDMap(container2BCSIDMap) + .build(); + DataChannel channel = getStreamDataChannel(requestProto, context); + final ExecutorService chunkExecutor = requestProto.hasWriteChunk() ? + getChunkExecutor(requestProto.getWriteChunk()) : null; + return new LocalStream(channel, chunkExecutor); + } catch (IOException e) { + throw new CompletionException("Failed to create data stream", e); + } + }, executor); + } + + @Override + public CompletableFuture link(DataStream stream, LogEntryProto entry) { + if (stream == null) { + return JavaUtils.completeExceptionally(new IllegalStateException( + "DataStream is null")); + } + final DataChannel dataChannel = stream.getDataChannel(); + if (dataChannel.isOpen()) { + return JavaUtils.completeExceptionally(new IllegalStateException( + "DataStream: " + stream + " is not closed properly")); + } + + final ContainerCommandRequestProto request; + if (dataChannel instanceof KeyValueStreamDataChannel) { + request = ((KeyValueStreamDataChannel) dataChannel).getPutBlockRequest(); + } else { + return JavaUtils.completeExceptionally(new IllegalStateException( + "Unexpected DataChannel " + dataChannel.getClass())); + } + return runCommandAsync(request, entry).whenComplete( + (res, e) -> LOG.debug("link {}, entry: {}, request: {}", + res.getResult(), entry, request)); + } + private ExecutorService getChunkExecutor(WriteChunkRequestProto req) { int i = (int)(req.getBlockID().getLocalID() % chunkExecutors.size()); return chunkExecutors.get(i); @@ -803,7 +879,8 @@ public CompletableFuture applyTransaction(TransactionContext trx) { builder.setStage(DispatcherContext.WriteChunkStage.COMMIT_DATA); } if (cmdType == Type.WriteChunk || cmdType == Type.PutSmallFile - || cmdType == Type.PutBlock || cmdType == Type.CreateContainer) { + || cmdType == Type.PutBlock || cmdType == Type.CreateContainer + || cmdType == Type.StreamInit) { builder.setContainer2BCSIDMap(container2BCSIDMap); } CompletableFuture applyTransactionFuture = diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/LocalStream.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/LocalStream.java new file mode 100644 index 000000000000..780f8743988a --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/LocalStream.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.ozone.container.common.transport.server.ratis; + +import org.apache.ratis.statemachine.StateMachine; + +import java.io.IOException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.concurrent.Executor; + +class LocalStream implements StateMachine.DataStream { + private final StateMachine.DataChannel dataChannel; + private final Executor executor; + + LocalStream(StateMachine.DataChannel dataChannel, Executor executor) { + this.dataChannel = dataChannel; + this.executor = executor; + } + + @Override + public StateMachine.DataChannel getDataChannel() { + return dataChannel; + } + + @Override + public CompletableFuture cleanUp() { + return CompletableFuture.supplyAsync(() -> { + try { + dataChannel.close(); + return true; + } catch (IOException e) { + throw new CompletionException("Failed to close data channel", e); + } + }); + } + + @Override + public Executor getExecutor() { + return executor; + } +} \ No newline at end of file diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java index c8d715cc60d2..6b0ad0e41e8a 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java @@ -98,6 +98,7 @@ import org.apache.ratis.protocol.RaftPeerId; import org.apache.ratis.rpc.RpcType; import org.apache.ratis.rpc.SupportedRpcType; +import org.apache.ratis.server.DataStreamServerRpc; import org.apache.ratis.server.RaftServer; import org.apache.ratis.server.RaftServerConfigKeys; import org.apache.ratis.server.RaftServerRpc; @@ -129,6 +130,7 @@ private static long nextCallId() { private int serverPort; private int adminPort; private int clientPort; + private int dataStreamPort; private final RaftServer server; private final List chunkExecutors; private final ContainerDispatcher dispatcher; @@ -148,6 +150,7 @@ private static long nextCallId() { // Timeout used while calling submitRequest directly. private long requestTimeout; private boolean shouldDeleteRatisLogDirectory; + private boolean streamEnable; private XceiverServerRatis(DatanodeDetails dd, ContainerDispatcher dispatcher, ContainerController containerController, @@ -157,6 +160,9 @@ private XceiverServerRatis(DatanodeDetails dd, Objects.requireNonNull(dd, "id == null"); datanodeDetails = dd; assignPorts(); + this.streamEnable = conf.getBoolean( + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE_DEFAULT); RaftProperties serverProperties = newRaftProperties(); this.context = context; this.dispatcher = dispatcher; @@ -213,6 +219,32 @@ private ContainerStateMachine getStateMachine(RaftGroupId gid) { chunkExecutors, this, conf); } + private void setUpRatisStream(RaftProperties properties) { + // set the datastream config + if (conf.getBoolean( + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT, + OzoneConfigKeys. 
+ DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT_DEFAULT)) { + dataStreamPort = 0; + } else { + dataStreamPort = conf.getInt( + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_PORT, + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_PORT_DEFAULT); + } + NettyConfigKeys.DataStream.setPort(properties, dataStreamPort); + int dataStreamAsyncRequestThreadPoolSize = + conf.getObject(DatanodeRatisServerConfig.class) + .getStreamRequestThreads(); + RaftServerConfigKeys.DataStream.setAsyncRequestThreadPoolSize(properties, + dataStreamAsyncRequestThreadPoolSize); + int dataStreamClientPoolSize = + conf.getObject(DatanodeRatisServerConfig.class) + .getClientPoolSize(); + RaftServerConfigKeys.DataStream.setClientPoolSize(properties, + dataStreamClientPoolSize); + } + + @SuppressWarnings("checkstyle:methodlength") private RaftProperties newRaftProperties() { final RaftProperties properties = new RaftProperties(); @@ -231,6 +263,10 @@ private RaftProperties newRaftProperties() { // set the configs enable and set the stateMachineData sync timeout RaftServerConfigKeys.Log.StateMachineData.setSync(properties, true); + if (streamEnable) { + setUpRatisStream(properties); + } + timeUnit = OzoneConfigKeys. DFS_CONTAINER_RATIS_STATEMACHINEDATA_SYNC_TIMEOUT_DEFAULT.getUnit(); duration = conf.getTimeDuration( @@ -491,7 +527,12 @@ public void start() throws IOException { Port.Name.RATIS_ADMIN); serverPort = getRealPort(serverRpc.getInetSocketAddress(), Port.Name.RATIS_SERVER); - + if (streamEnable) { + DataStreamServerRpc dataStreamServerRpc = + server.getDataStreamServerRpc(); + dataStreamPort = getRealPort(dataStreamServerRpc.getInetSocketAddress(), + Port.Name.RATIS_DATASTREAM); + } isStarted = true; } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index 7fcbdb3e7f4c..8bfe1623d6fc 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -100,11 +100,13 @@ import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.getReadChunkResponse; import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.getReadContainerResponse; import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.getSuccessResponse; +import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.getSuccessResponseBuilder; import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.malformedRequest; import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.putBlockResponseSuccess; import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.unsupportedRequest; import static org.apache.hadoop.hdds.scm.utils.ClientCommandsUtils.getReadChunkVersion; +import org.apache.ratis.statemachine.StateMachine; import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -178,6 +180,25 @@ public VolumeChoosingPolicy getVolumeChoosingPolicyForTesting() { return volumeChoosingPolicy; } + @Override + public StateMachine.DataChannel getStreamDataChannel( + Container container, ContainerCommandRequestProto msg) + throws StorageContainerException { + KeyValueContainer 
kvContainer = (KeyValueContainer) container;
+    checkContainerOpen(kvContainer);
+
+    if (msg.hasWriteChunk()) {
+      BlockID blockID =
+          BlockID.getFromProtobuf(msg.getWriteChunk().getBlockID());
+
+      return chunkManager.getStreamDataChannel(kvContainer,
+          blockID, metrics);
+    } else {
+      throw new StorageContainerException("Malformed request.",
+          ContainerProtos.Result.IO_EXCEPTION);
+    }
+  }
+
   @Override
   public void stop() {
     chunkManager.shutdown();
@@ -227,6 +248,8 @@ static ContainerCommandResponseProto dispatchRequest(KeyValueHandler handler,
       return handler.handleDeleteChunk(request, kvContainer);
     case WriteChunk:
       return handler.handleWriteChunk(request, kvContainer, dispatcherContext);
+    case StreamInit:
+      return handler.handleStreamInit(request, kvContainer, dispatcherContext);
     case ListChunk:
       return handler.handleUnsupportedOp(request);
     case CompactChunk:
@@ -253,6 +276,35 @@ public BlockManager getBlockManager() {
     return this.blockManager;
   }
 
+  ContainerCommandResponseProto handleStreamInit(
+      ContainerCommandRequestProto request, KeyValueContainer kvContainer,
+      DispatcherContext dispatcherContext) {
+    final BlockID blockID;
+    if (request.hasWriteChunk()) {
+      WriteChunkRequestProto writeChunk = request.getWriteChunk();
+      blockID = BlockID.getFromProtobuf(writeChunk.getBlockID());
+    } else {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Malformed {} request. trace ID: {}",
+            request.getCmdType(), request.getTraceID());
+      }
+      return malformedRequest(request);
+    }
+
+    String path = null;
+    try {
+      checkContainerOpen(kvContainer);
+      path = chunkManager
+          .streamInit(kvContainer, blockID);
+    } catch (StorageContainerException ex) {
+      return ContainerUtils.logAndReturnError(LOG, ex, request);
+    }
+
+    return getSuccessResponseBuilder(request)
+        .setMessage(path)
+        .build();
+  }
+
   /**
   * Handles Create Container Request. If successful, adds the container to
   * ContainerSet and sends an ICR to the SCM.
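Not part of the patch: the StreamInit handling above is only reachable when the datanode side of Ratis streaming is switched on. A minimal sketch of how the new keys introduced in this patch could be set on an OzoneConfiguration, assuming the standard Hadoop Configuration setters; only the key constants and their defaults are taken from the patch, the rest is illustrative.

    import org.apache.hadoop.hdds.conf.OzoneConfiguration;
    import org.apache.hadoop.ozone.OzoneConfigKeys;

    OzoneConfiguration conf = new OzoneConfiguration();
    // Turn on the datanode datastream server (off by default) on its default port 9855.
    conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, true);
    conf.setInt(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_PORT,
        OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_PORT_DEFAULT);
    // Route filesystem writes through the streaming path (also off by default).
    conf.setBoolean(OzoneConfigKeys.OZONE_FS_DATASTREAM_ENABLE, true);
    // Mini-cluster style tests in this patch use a random datastream port instead:
    conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT, true);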
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/ChunkManagerDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/ChunkManagerDispatcher.java index 763647313b8c..92f6327447ab 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/ChunkManagerDispatcher.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/ChunkManagerDispatcher.java @@ -25,6 +25,7 @@ import org.apache.hadoop.ozone.common.ChunkBuffer; import org.apache.hadoop.ozone.container.common.helpers.BlockData; import org.apache.hadoop.ozone.container.common.helpers.ChunkInfo; +import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext; import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion; import org.apache.hadoop.ozone.container.common.volume.VolumeSet; @@ -33,6 +34,7 @@ import org.apache.hadoop.ozone.container.keyvalue.interfaces.ChunkManager; import org.apache.hadoop.ozone.container.common.interfaces.Container; +import org.apache.ratis.statemachine.StateMachine; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -73,6 +75,20 @@ public void writeChunk(Container container, BlockID blockID, ChunkInfo info, .writeChunk(container, blockID, info, data, dispatcherContext); } + public String streamInit(Container container, BlockID blockID) + throws StorageContainerException { + return selectHandler(container) + .streamInit(container, blockID); + } + + @Override + public StateMachine.DataChannel getStreamDataChannel( + Container container, BlockID blockID, ContainerMetrics metrics) + throws StorageContainerException { + return selectHandler(container) + .getStreamDataChannel(container, blockID, metrics); + } + @Override public void finishWriteChunks(KeyValueContainer kvContainer, BlockData blockData) throws IOException { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/FilePerBlockStrategy.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/FilePerBlockStrategy.java index 51cd5708d3e6..23db342da030 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/FilePerBlockStrategy.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/FilePerBlockStrategy.java @@ -32,6 +32,7 @@ import org.apache.hadoop.ozone.common.utils.BufferUtils; import org.apache.hadoop.ozone.container.common.helpers.BlockData; import org.apache.hadoop.ozone.container.common.helpers.ChunkInfo; +import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext; import org.apache.hadoop.ozone.container.common.volume.VolumeSet; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer; @@ -42,6 +43,7 @@ import org.apache.hadoop.ozone.container.keyvalue.interfaces.ChunkManager; import org.apache.hadoop.ozone.container.common.interfaces.Container; +import org.apache.ratis.statemachine.StateMachine; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -89,6 +91,24 @@ private static void checkLayoutVersion(Container container) { container.getContainerData().getLayoutVersion() == FILE_PER_BLOCK); } + @Override + public String 
streamInit(Container container, BlockID blockID) + throws StorageContainerException { + checkLayoutVersion(container); + File chunkFile = getChunkFile(container, blockID, null); + return chunkFile.getAbsolutePath(); + } + + @Override + public StateMachine.DataChannel getStreamDataChannel( + Container container, BlockID blockID, ContainerMetrics metrics) + throws StorageContainerException { + checkLayoutVersion(container); + File chunkFile = getChunkFile(container, blockID, null); + return new KeyValueStreamDataChannel(chunkFile, + container.getContainerData(), metrics); + } + @Override public void writeChunk(Container container, BlockID blockID, ChunkInfo info, ChunkBuffer data, DispatcherContext dispatcherContext) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/KeyValueStreamDataChannel.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/KeyValueStreamDataChannel.java new file mode 100644 index 000000000000..99dc40f5d002 --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/KeyValueStreamDataChannel.java @@ -0,0 +1,277 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.container.keyvalue.impl; + +import com.google.common.base.Preconditions; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto; +import org.apache.hadoop.hdds.ratis.ContainerCommandRequestMessage; +import org.apache.hadoop.hdds.ratis.RatisHelper; +import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.hdds.scm.storage.BlockDataStreamOutput; +import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; +import org.apache.hadoop.ozone.container.common.impl.ContainerData; +import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; +import org.apache.ratis.thirdparty.io.netty.buffer.ByteBuf; +import org.apache.ratis.thirdparty.io.netty.buffer.Unpooled; +import org.apache.ratis.util.ReferenceCountedObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Deque; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; + +/** + * This class is used to get the DataChannel for streaming. 
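+ * Data is written through to the underlying block file, while the last
+ * bytes of the stream are retained (see Buffers below) so that the
+ * PutBlock request appended by the client can be rebuilt on close.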
+ */ +public class KeyValueStreamDataChannel extends StreamDataChannelBase { + public static final Logger LOG = + LoggerFactory.getLogger(KeyValueStreamDataChannel.class); + + /** + * Keep the last {@link Buffers#max} bytes in the buffer + * in order to create putBlockRequest + * at {@link #closeBuffers(Buffers, WriteMethod)}}. + */ + static class Buffers { + private final Deque> deque + = new LinkedList<>(); + private final int max; + private int length; + + Buffers(int max) { + this.max = max; + } + + private boolean isExtra(int n) { + return length - n >= max; + } + + private boolean hasExtraBuffer() { + return Optional.ofNullable(deque.peek()) + .map(ReferenceCountedObject::get) + .filter(b -> isExtra(b.remaining())) + .isPresent(); + } + + /** + * @return extra buffers which are safe to be written. + */ + Iterable> offer( + ReferenceCountedObject ref) { + final ByteBuffer buffer = ref.retain(); + LOG.debug("offer {}", buffer); + final boolean offered = deque.offer(ref); + Preconditions.checkState(offered, "Failed to offer"); + length += buffer.remaining(); + + return () -> new Iterator>() { + @Override + public boolean hasNext() { + return hasExtraBuffer(); + } + + @Override + public ReferenceCountedObject next() { + final ReferenceCountedObject polled = poll(); + length -= polled.get().remaining(); + Preconditions.checkState(length >= max); + return polled; + } + }; + } + + ReferenceCountedObject poll() { + final ReferenceCountedObject polled + = Objects.requireNonNull(deque.poll()); + RatisHelper.debug(polled.get(), "polled", LOG); + return polled; + } + + ReferenceCountedObject pollAll() { + Preconditions.checkState(!deque.isEmpty(), "The deque is empty"); + final ByteBuffer[] array = new ByteBuffer[deque.size()]; + final List> refs + = new ArrayList<>(deque.size()); + for (int i = 0; i < array.length; i++) { + final ReferenceCountedObject ref = poll(); + refs.add(ref); + array[i] = ref.get(); + } + final ByteBuf buf = Unpooled.wrappedBuffer(array).asReadOnly(); + return ReferenceCountedObject.wrap(buf, () -> { + }, () -> { + buf.release(); + refs.forEach(ReferenceCountedObject::release); + }); + } + } + + interface WriteMethod { + int applyAsInt(ByteBuffer src) throws IOException; + } + + private final Buffers buffers = new Buffers( + BlockDataStreamOutput.PUT_BLOCK_REQUEST_LENGTH_MAX); + private final AtomicReference putBlockRequest + = new AtomicReference<>(); + private final AtomicBoolean closed = new AtomicBoolean(); + + KeyValueStreamDataChannel(File file, ContainerData containerData, + ContainerMetrics metrics) + throws StorageContainerException { + super(file, containerData, metrics); + } + + @Override + ContainerProtos.Type getType() { + return ContainerProtos.Type.StreamWrite; + } + + @Override + public int write(ReferenceCountedObject referenceCounted) + throws IOException { + assertOpen(); + return writeBuffers(referenceCounted, buffers, super::writeFileChannel); + } + + static int writeBuffers(ReferenceCountedObject src, + Buffers buffers, WriteMethod writeMethod) + throws IOException { + for (ReferenceCountedObject b : buffers.offer(src)) { + try { + writeFully(b.get(), writeMethod); + } finally { + b.release(); + } + } + return src.get().remaining(); + } + + private static void writeFully(ByteBuffer b, WriteMethod writeMethod) + throws IOException { + for (; b.remaining() > 0;) { + final int written = writeMethod.applyAsInt(b); + if (written <= 0) { + throw new IOException("Unable to write"); + } + } + } + + public ContainerCommandRequestProto getPutBlockRequest() { 
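+    // Populated in close() from the retained tail of the stream; calling
+    // this before the channel is closed fails the requireNonNull below.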
+ return Objects.requireNonNull(putBlockRequest.get(), + () -> "putBlockRequest == null, " + this); + } + + void assertOpen() throws IOException { + if (closed.get()) { + throw new IOException("Already closed: " + this); + } + } + + @Override + public void close() throws IOException { + if (closed.compareAndSet(false, true)) { + putBlockRequest.set(closeBuffers(buffers, super::writeFileChannel)); + super.close(); + } + } + + static ContainerCommandRequestProto closeBuffers( + Buffers buffers, WriteMethod writeMethod) throws IOException { + final ReferenceCountedObject ref = buffers.pollAll(); + final ByteBuf buf = ref.retain(); + final ContainerCommandRequestProto putBlockRequest; + try { + putBlockRequest = readPutBlockRequest(buf); + // write the remaining data + writeFully(buf.nioBuffer(), writeMethod); + } finally { + ref.release(); + } + return putBlockRequest; + } + + private static int readProtoLength(ByteBuf b, int lengthIndex) { + final int readerIndex = b.readerIndex(); + LOG.debug("{}, lengthIndex = {}, readerIndex = {}", + b, lengthIndex, readerIndex); + if (lengthIndex > readerIndex) { + b.readerIndex(lengthIndex); + } else { + Preconditions.checkState(lengthIndex == readerIndex); + } + RatisHelper.debug(b, "readProtoLength", LOG); + return b.nioBuffer().getInt(); + } + + static ContainerCommandRequestProto readPutBlockRequest(ByteBuf b) + throws IOException { + // readerIndex protoIndex lengthIndex readerIndex+readableBytes + // V V V V + // format: |--- data ---|--- proto ---|--- proto length (4 bytes) ---| + final int readerIndex = b.readerIndex(); + final int lengthIndex = readerIndex + b.readableBytes() - 4; + final int protoLength = readProtoLength(b.duplicate(), lengthIndex); + final int protoIndex = lengthIndex - protoLength; + + final ContainerCommandRequestProto proto; + try { + proto = readPutBlockRequest(b.slice(protoIndex, protoLength).nioBuffer()); + } catch (Throwable t) { + RatisHelper.debug(b, "catch", LOG); + throw new IOException("Failed to readPutBlockRequest from " + b + + ": readerIndex=" + readerIndex + + ", protoIndex=" + protoIndex + + ", protoLength=" + protoLength + + ", lengthIndex=" + lengthIndex, t); + } + + // set index for reading data + b.writerIndex(protoIndex); + + return proto; + } + + private static ContainerCommandRequestProto readPutBlockRequest(ByteBuffer b) + throws IOException { + RatisHelper.debug(b, "readPutBlockRequest", LOG); + final ByteString byteString = ByteString.copyFrom(b); + + final ContainerCommandRequestProto request = + ContainerCommandRequestMessage.toProto(byteString, null); + + if (!request.hasPutBlock()) { + throw new StorageContainerException( + "Malformed PutBlock request. trace ID: " + request.getTraceID(), + ContainerProtos.Result.MALFORMED_REQUEST); + } + return request; + } +} diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/StreamDataChannelBase.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/StreamDataChannelBase.java new file mode 100644 index 000000000000..982903324848 --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/StreamDataChannelBase.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.container.keyvalue.impl; + +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; +import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; +import org.apache.hadoop.ozone.container.common.impl.ContainerData; +import org.apache.ratis.statemachine.StateMachine; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; + +/** + * For write state machine data. + */ +abstract class StreamDataChannelBase implements StateMachine.DataChannel { + private final RandomAccessFile randomAccessFile; + + private final File file; + + private final ContainerData containerData; + private final ContainerMetrics metrics; + + StreamDataChannelBase(File file, ContainerData containerData, + ContainerMetrics metrics) + throws StorageContainerException { + try { + this.file = file; + this.randomAccessFile = new RandomAccessFile(file, "rw"); + } catch (FileNotFoundException e) { + throw new StorageContainerException("BlockFile not exists with " + + "container Id " + containerData.getContainerID() + + " file " + file.getAbsolutePath(), + ContainerProtos.Result.IO_EXCEPTION); + } + this.containerData = containerData; + this.metrics = metrics; + } + + abstract ContainerProtos.Type getType(); + + private FileChannel getChannel() { + return randomAccessFile.getChannel(); + } + + @Override + public final void force(boolean metadata) throws IOException { + getChannel().force(metadata); + } + + @Override + public final boolean isOpen() { + return getChannel().isOpen(); + } + + @Override + public void close() throws IOException { + randomAccessFile.close(); + } + + final int writeFileChannel(ByteBuffer src) throws IOException { + final int writeBytes = getChannel().write(src); + metrics.incContainerBytesStats(getType(), writeBytes); + containerData.updateWriteStats(writeBytes, false); + return writeBytes; + } + + @Override + public String toString() { + return getClass().getSimpleName() + "{" + + "File=" + file.getAbsolutePath() + + ", containerID=" + containerData.getContainerID() + + '}'; + } +} diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/interfaces/ChunkManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/interfaces/ChunkManager.java index 15ff9d6b9d61..7a64f076281b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/interfaces/ChunkManager.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/interfaces/ChunkManager.java @@ -25,9 +25,11 @@ import org.apache.hadoop.ozone.common.ChunkBuffer; import org.apache.hadoop.ozone.container.common.helpers.BlockData; 
import org.apache.hadoop.ozone.container.common.helpers.ChunkInfo; +import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer; +import org.apache.ratis.statemachine.StateMachine; import java.io.IOException; import java.nio.ByteBuffer; @@ -104,6 +106,17 @@ default void finishWriteChunks(KeyValueContainer kvContainer, // no-op } + default String streamInit(Container container, BlockID blockID) + throws StorageContainerException { + return null; + } + + default StateMachine.DataChannel getStreamDataChannel( + Container container, BlockID blockID, ContainerMetrics metrics) + throws StorageContainerException { + return null; + } + static long getBufferCapacityForChunkRead(ChunkInfo chunkInfo, long defaultReadBufferCapacity) { long bufferCapacity = 0; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestDatanodeStateMachine.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestDatanodeStateMachine.java index 1337f28ad945..bb1145bb2b2d 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestDatanodeStateMachine.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestDatanodeStateMachine.java @@ -81,6 +81,8 @@ public void setUp() throws Exception { TimeUnit.MILLISECONDS); conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_IPC_RANDOM_PORT, true); conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_IPC_RANDOM_PORT, true); + conf.setBoolean( + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT, true); serverAddresses = new ArrayList<>(); scmServers = new ArrayList<>(); mockServers = new ArrayList<>(); @@ -215,7 +217,6 @@ public void testDatanodeStateContext() throws IOException, OzoneConfigKeys.DFS_CONTAINER_IPC_PORT_DEFAULT); datanodeDetails.setPort(port); ContainerUtils.writeDatanodeDetailsTo(datanodeDetails, idPath); - try (DatanodeStateMachine stateMachine = new DatanodeStateMachine(datanodeDetails, conf, null, null, null)) { @@ -424,6 +425,8 @@ private DatanodeDetails getNewDatanodeDetails() { DatanodeDetails.Port.Name.RATIS, 0); DatanodeDetails.Port restPort = DatanodeDetails.newPort( DatanodeDetails.Port.Name.REST, 0); + DatanodeDetails.Port streamPort = DatanodeDetails.newPort( + DatanodeDetails.Port.Name.RATIS_DATASTREAM, 0); return DatanodeDetails.newBuilder() .setUuid(UUID.randomUUID()) .setHostName("localhost") @@ -431,6 +434,7 @@ private DatanodeDetails getNewDatanodeDetails() { .addPort(containerPort) .addPort(ratisPort) .addPort(restPort) + .addPort(streamPort) .build(); } } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/impl/TestKeyValueStreamDataChannel.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/impl/TestKeyValueStreamDataChannel.java new file mode 100644 index 000000000000..d252b1cb1bef --- /dev/null +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/impl/TestKeyValueStreamDataChannel.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.container.keyvalue.impl; + +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.BlockData; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.DatanodeBlockID; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.PutBlockRequestProto; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Type; +import org.apache.hadoop.hdds.ratis.ContainerCommandRequestMessage; +import org.apache.hadoop.ozone.container.keyvalue.impl.KeyValueStreamDataChannel.Buffers; +import org.apache.hadoop.ozone.container.keyvalue.impl.KeyValueStreamDataChannel.WriteMethod; +import org.apache.ratis.client.api.DataStreamOutput; +import org.apache.ratis.io.FilePositionCount; +import org.apache.ratis.io.StandardWriteOption; +import org.apache.ratis.io.WriteOption; +import org.apache.ratis.proto.RaftProtos.CommitInfoProto; +import org.apache.ratis.proto.RaftProtos.DataStreamPacketHeaderProto; +import org.apache.ratis.protocol.ClientId; +import org.apache.ratis.protocol.DataStreamReply; +import org.apache.ratis.protocol.RaftClientReply; +import org.apache.ratis.thirdparty.io.netty.buffer.ByteBuf; +import org.apache.ratis.thirdparty.io.netty.buffer.Unpooled; +import org.apache.ratis.util.ReferenceCountedObject; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Random; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadLocalRandom; + +import static org.apache.hadoop.hdds.scm.storage.BlockDataStreamOutput.PUT_BLOCK_REQUEST_LENGTH_MAX; +import static org.apache.hadoop.hdds.scm.storage.BlockDataStreamOutput.executePutBlockClose; +import static org.apache.hadoop.hdds.scm.storage.BlockDataStreamOutput.getProtoLength; +import static org.apache.hadoop.ozone.container.keyvalue.impl.KeyValueStreamDataChannel.closeBuffers; +import static org.apache.hadoop.ozone.container.keyvalue.impl.KeyValueStreamDataChannel.readPutBlockRequest; +import static org.apache.hadoop.ozone.container.keyvalue.impl.KeyValueStreamDataChannel.writeBuffers; + +/** For testing {@link KeyValueStreamDataChannel}. 
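+ * Covers the PutBlock tail framing (readPutBlockRequest) and the Buffers
+ * write/close path via an in-memory DataStreamOutput; no datanode is involved.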
*/ +public class TestKeyValueStreamDataChannel { + public static final Logger LOG = + LoggerFactory.getLogger(TestKeyValueStreamDataChannel.class); + + static final ContainerCommandRequestProto PUT_BLOCK_PROTO + = ContainerCommandRequestProto.newBuilder() + .setCmdType(Type.PutBlock) + .setPutBlock(PutBlockRequestProto.newBuilder().setBlockData( + BlockData.newBuilder().setBlockID(DatanodeBlockID.newBuilder() + .setContainerID(222).setLocalID(333).build()).build())) + .setDatanodeUuid("datanodeId") + .setContainerID(111L) + .build(); + static final int PUT_BLOCK_PROTO_SIZE = PUT_BLOCK_PROTO.toByteString().size(); + static { + LOG.info("PUT_BLOCK_PROTO_SIZE = {}", PUT_BLOCK_PROTO_SIZE); + } + + @Test + public void testSerialization() throws Exception { + final int max = PUT_BLOCK_REQUEST_LENGTH_MAX; + final ByteBuffer putBlockBuf = ContainerCommandRequestMessage.toMessage( + PUT_BLOCK_PROTO, null).getContent().asReadOnlyByteBuffer(); + final ByteBuffer protoLengthBuf = getProtoLength(putBlockBuf, max); + + // random data size + final int dataSize = ThreadLocalRandom.current().nextInt(1000) + 100; + final byte[] data = new byte[dataSize]; + + //serialize + final ByteBuf buf = Unpooled.buffer(max); + buf.writeBytes(data); + buf.writeBytes(putBlockBuf); + buf.writeBytes(protoLengthBuf); + + final ContainerCommandRequestProto proto = readPutBlockRequest(buf); + Assert.assertEquals(PUT_BLOCK_PROTO, proto); + } + + @Test + public void testBuffers() throws Exception { + final ExecutorService executor = Executors.newFixedThreadPool(32); + final List> futures = new ArrayList<>(); + + final int min = PUT_BLOCK_PROTO_SIZE + 4; + final int[] maxValues = {min, 2 * min, 10 * min}; + final int[] dataSizes = {0, 10, 100, 10_000}; + for (int max : maxValues) { + for (int dataSize : dataSizes) { + futures.add(CompletableFuture.supplyAsync( + () -> runTestBuffers(dataSize, max), executor)); + } + } + + for (CompletableFuture f : futures) { + f.get(); + } + } + + static String runTestBuffers(int dataSize, int max) { + final int seed = ThreadLocalRandom.current().nextInt(); + final String name = String.format("[dataSize=%d,max=%d,seed=%H]", + dataSize, max, seed); + LOG.info(name); + try { + runTestBuffers(dataSize, max, seed, name); + } catch (Throwable t) { + throw new CompletionException("Failed " + name, t); + } + return name; + } + + static void runTestBuffers(int dataSize, int max, int seed, String name) + throws Exception { + Assert.assertTrue(max >= PUT_BLOCK_PROTO_SIZE); + + // random data + final byte[] data = new byte[dataSize]; + final Random random = new Random(seed); + random.nextBytes(data); + + // write output + final Buffers buffers = new Buffers(max); + final Output out = new Output(buffers); + for (int offset = 0; offset < dataSize;) { + final int randomLength = random.nextInt(4 * max); + final int length = Math.min(randomLength, dataSize - offset); + LOG.info("{}: offset = {}, length = {}", name, offset, length); + final ByteBuffer b = ByteBuffer.wrap(data, offset, length); + final DataStreamReply writeReply = out.writeAsync(b).get(); + assertReply(writeReply, length, null); + offset += length; + } + + // close + final DataStreamReply closeReply = executePutBlockClose( + PUT_BLOCK_PROTO, max, out).get(); + assertReply(closeReply, 0, PUT_BLOCK_PROTO); + + // check output + final ByteBuf outBuf = out.getOutBuf(); + LOG.info("outBuf = {}", outBuf); + Assert.assertEquals(dataSize, outBuf.readableBytes()); + for (int i = 0; i < dataSize; i++) { + Assert.assertEquals(data[i], outBuf.readByte()); + 
} + outBuf.release(); + } + + static void assertReply(DataStreamReply reply, int byteWritten, + ContainerCommandRequestProto proto) { + Assert.assertTrue(reply.isSuccess()); + Assert.assertEquals(byteWritten, reply.getBytesWritten()); + Assert.assertEquals(proto, ((Reply)reply).getPutBlockRequest()); + } + + static class Output implements DataStreamOutput { + private final Buffers buffers; + private final ByteBuf outBuf = Unpooled.buffer(); + private final WriteMethod writeMethod = src -> { + final int remaining = src.remaining(); + outBuf.writeBytes(src); + return remaining; + }; + + Output(Buffers buffers) { + this.buffers = buffers; + } + + ByteBuf getOutBuf() { + return outBuf; + } + + @Override + public CompletableFuture writeAsync( + ByteBuffer src, WriteOption... writeOptions) { + final int written; + try { + written = writeBuffers( + ReferenceCountedObject.wrap(src, () -> { }, () -> { }), + buffers, writeMethod); + } catch (IOException e) { + return completeExceptionally(e); + } + if (WriteOption.containsOption(writeOptions, StandardWriteOption.CLOSE)) { + return closeAsync(); + } + return CompletableFuture.completedFuture( + new Reply(true, written)); + } + + @Override + public CompletableFuture closeAsync() { + final ContainerCommandRequestProto putBlockRequest; + try { + putBlockRequest = closeBuffers(buffers, writeMethod); + } catch (IOException e) { + return completeExceptionally(e); + } + return CompletableFuture.completedFuture( + new Reply(true, 0, putBlockRequest)); + } + + @Override + public CompletableFuture writeAsync( + FilePositionCount filePositionCount, WriteOption... writeOptions) { + throw new UnsupportedOperationException(); + } + + @Override + public CompletableFuture getRaftClientReplyFuture() { + throw new UnsupportedOperationException(); + } + + @Override + public WritableByteChannel getWritableByteChannel() { + throw new UnsupportedOperationException(); + } + } + + static class Reply implements DataStreamReply { + private final boolean success; + private final long bytesWritten; + private final ContainerCommandRequestProto putBlockRequest; + + Reply(boolean success, long bytesWritten) { + this(success, bytesWritten, null); + } + + Reply(boolean success, long bytesWritten, + ContainerCommandRequestProto putBlockRequest) { + this.success = success; + this.bytesWritten = bytesWritten; + this.putBlockRequest = putBlockRequest; + } + + ContainerCommandRequestProto getPutBlockRequest() { + return putBlockRequest; + } + + @Override + public boolean isSuccess() { + return success; + } + + @Override + public long getBytesWritten() { + return bytesWritten; + } + + @Override + public Collection getCommitInfos() { + throw new UnsupportedOperationException(); + } + + @Override + public ClientId getClientId() { + throw new UnsupportedOperationException(); + } + + @Override + public DataStreamPacketHeaderProto.Type getType() { + throw new UnsupportedOperationException(); + } + + @Override + public long getStreamId() { + throw new UnsupportedOperationException(); + } + + @Override + public long getStreamOffset() { + throw new UnsupportedOperationException(); + } + + @Override + public long getDataLength() { + throw new UnsupportedOperationException(); + } + } + + static CompletableFuture completeExceptionally(Throwable t) { + final CompletableFuture f = new CompletableFuture<>(); + f.completeExceptionally(t); + return f; + } +} diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/upgrade/TestDatanodeUpgradeToSchemaV3.java 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/upgrade/TestDatanodeUpgradeToSchemaV3.java index a7e61c5a9fd3..aa9637754825 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/upgrade/TestDatanodeUpgradeToSchemaV3.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/upgrade/TestDatanodeUpgradeToSchemaV3.java @@ -105,6 +105,8 @@ public TestDatanodeUpgradeToSchemaV3(Boolean enable) { conf = new OzoneConfiguration(); conf.setBoolean(DatanodeConfiguration.CONTAINER_SCHEMA_V3_ENABLED, this.schemaV3Enabled); + conf.setBoolean( + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT, true); } @Before diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/conf/DatanodeRatisServerConfig.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/conf/DatanodeRatisServerConfig.java index 25ed4776b7d8..058932e76902 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/conf/DatanodeRatisServerConfig.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/conf/DatanodeRatisServerConfig.java @@ -23,6 +23,7 @@ import java.time.Duration; import static org.apache.hadoop.hdds.conf.ConfigTag.DATANODE; +import static org.apache.hadoop.hdds.conf.ConfigTag.DATASTREAM; import static org.apache.hadoop.hdds.conf.ConfigTag.OZONE; import static org.apache.hadoop.hdds.conf.ConfigTag.PERFORMANCE; import static org.apache.hadoop.hdds.conf.ConfigTag.RATIS; @@ -123,6 +124,40 @@ public void setLeaderNumPendingRequests(int leaderNumPendingRequests) { this.leaderNumPendingRequests = leaderNumPendingRequests; } + @Config(key = "datastream.request.threads", + defaultValue = "20", + type = ConfigType.INT, + tags = {OZONE, DATANODE, RATIS, DATASTREAM}, + description = "Maximum number of threads in the thread pool for " + + "datastream request." + ) + private int streamRequestThreads; + + public int getStreamRequestThreads() { + return streamRequestThreads; + } + + public void setStreamRequestThreads(int streamRequestThreads) { + this.streamRequestThreads = streamRequestThreads; + } + + @Config(key = "datastream.client.pool.size", + defaultValue = "10", + type = ConfigType.INT, + tags = {OZONE, DATANODE, RATIS, DATASTREAM}, + description = "Maximum number of client proxy in NettyServerStreamRpc " + + "for datastream write." 
+ ) + private int clientPoolSize; + + public int getClientPoolSize() { + return clientPoolSize; + } + + public void setClientPoolSize(int clientPoolSize) { + this.clientPoolSize = clientPoolSize; + } + @Config(key = "delete.ratis.log.directory", defaultValue = "true", type = ConfigType.BOOLEAN, diff --git a/hadoop-hdds/interface-client/src/main/proto/DatanodeClientProtocol.proto b/hadoop-hdds/interface-client/src/main/proto/DatanodeClientProtocol.proto index c16059c5c43c..d19f466a971f 100644 --- a/hadoop-hdds/interface-client/src/main/proto/DatanodeClientProtocol.proto +++ b/hadoop-hdds/interface-client/src/main/proto/DatanodeClientProtocol.proto @@ -100,6 +100,9 @@ enum Type { GetSmallFile = 16; CloseContainer = 17; GetCommittedBlockLength = 18; + + StreamInit = 19; + StreamWrite = 20; } @@ -400,7 +403,7 @@ enum ChecksumType { message WriteChunkRequestProto { required DatanodeBlockID blockID = 1; - required ChunkInfo chunkData = 2; + optional ChunkInfo chunkData = 2; optional bytes data = 3; } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/ozone/container/common/TestEndPoint.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/ozone/container/common/TestEndPoint.java index 0230109fa47f..56a04de02ccc 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/ozone/container/common/TestEndPoint.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/ozone/container/common/TestEndPoint.java @@ -144,6 +144,8 @@ public void testGetVersionTask() throws Exception { try (EndpointStateMachine rpcEndPoint = createEndpoint(conf, serverAddress, 1000)) { DatanodeDetails datanodeDetails = randomDatanodeDetails(); + conf.setBoolean( + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT, true); OzoneContainer ozoneContainer = new OzoneContainer( datanodeDetails, conf, getContext(datanodeDetails), null); rpcEndPoint.setState(EndpointStateMachine.EndPointStates.GETVERSION); @@ -168,6 +170,8 @@ public void testCheckVersionResponse() throws Exception { true); conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_IPC_RANDOM_PORT, true); + conf.setBoolean( + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT, true); conf.setFromObject(new ReplicationConfig().setPort(0)); try (EndpointStateMachine rpcEndPoint = createEndpoint(conf, serverAddress, 1000)) { diff --git a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/OzoneBucket.java b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/OzoneBucket.java index a7c3f76de14b..a0e61d43dca7 100644 --- a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/OzoneBucket.java +++ b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/OzoneBucket.java @@ -33,6 +33,7 @@ import org.apache.hadoop.hdds.client.ReplicationType; import org.apache.hadoop.hdds.scm.client.HddsClientUtils; import org.apache.hadoop.ozone.OmUtils; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.client.io.OzoneInputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.client.protocol.ClientProtocol; @@ -599,6 +600,24 @@ public OzoneOutputStream createKey(String key, long size, .createKey(volumeName, name, key, size, replicationConfig, keyMetadata); } + /** + * Creates a new key in the bucket. + * + * @param key Name of the key to be created. + * @param size Size of the data the key will point to. + * @param replicationConfig Replication configuration. 
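A minimal sketch (not part of this patch) of how the two DatanodeRatisServerConfig settings added above, datastream.request.threads and datastream.client.pool.size, could be applied programmatically. It assumes the same getObject/setFromObject round-trip that the tests in this patch use for OzoneClientConfig; the values simply restate the declared defaults.

import org.apache.hadoop.hdds.conf.DatanodeRatisServerConfig;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;

final class DatastreamServerTuning {
  // Returns a configuration with the new datastream thread pool and client pool sizes set.
  static OzoneConfiguration tuned() {
    OzoneConfiguration conf = new OzoneConfiguration();
    DatanodeRatisServerConfig ratisConf = conf.getObject(DatanodeRatisServerConfig.class);
    ratisConf.setStreamRequestThreads(20); // datastream.request.threads (default above)
    ratisConf.setClientPoolSize(10);       // datastream.client.pool.size (default above)
    conf.setFromObject(ratisConf);
    return conf;
  }
}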
+ * @return OzoneDataStreamOutput to which the data has to be written. + * @throws IOException + */ + public OzoneDataStreamOutput createStreamKey(String key, long size, + ReplicationConfig replicationConfig, + Map keyMetadata) + throws IOException { + return proxy + .createStreamKey(volumeName, name, key, size, replicationConfig, + keyMetadata); + } + /** * Reads an existing key from the bucket. * @@ -791,6 +810,21 @@ public OzoneOutputStream createMultipartKey(String key, long size, uploadID); } + /** + * Create a part key for a multipart upload key. + * @param key + * @param size + * @param partNumber + * @param uploadID + * @return OzoneDataStreamOutput + * @throws IOException + */ + public OzoneDataStreamOutput createMultipartStreamKey(String key, + long size, int partNumber, String uploadID) throws IOException { + return proxy.createMultipartStreamKey(volumeName, name, + key, size, partNumber, uploadID); + } + /** * Complete Multipart upload. This will combine all the parts and make the * key visible in ozone. @@ -921,6 +955,14 @@ public OzoneOutputStream createFile(String keyName, long size, overWrite, recursive); } + public OzoneDataStreamOutput createStreamFile(String keyName, long size, + ReplicationConfig replicationConfig, boolean overWrite, + boolean recursive) throws IOException { + return proxy + .createStreamFile(volumeName, name, keyName, size, replicationConfig, + overWrite, recursive); + } + /** * List the status for a file or a directory and its contents. * diff --git a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/BlockDataStreamOutputEntry.java b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/BlockDataStreamOutputEntry.java new file mode 100644 index 000000000000..4e5a35a539ce --- /dev/null +++ b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/BlockDataStreamOutputEntry.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
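The new OzoneBucket#createMultipartStreamKey above mirrors the existing createMultipartKey but returns an OzoneDataStreamOutput. A hypothetical part upload could look like the sketch below; the uploadID is assumed to come from the pre-existing initiateMultipartUpload API (unchanged by this patch), and all other names are placeholders.

import java.nio.ByteBuffer;
import org.apache.hadoop.ozone.client.OzoneBucket;
import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput;
import org.apache.hadoop.ozone.om.helpers.OmMultipartCommitUploadPartInfo;

final class MultipartStreamPartExample {
  // Streams one part and returns the commit info needed to complete the upload.
  static OmMultipartCommitUploadPartInfo writePart(OzoneBucket bucket, String keyName,
      String uploadID, int partNumber, byte[] partData) throws Exception {
    OzoneDataStreamOutput part = bucket.createMultipartStreamKey(
        keyName, partData.length, partNumber, uploadID);
    part.write(ByteBuffer.wrap(partData));
    part.close();
    return part.getCommitUploadPartInfo();
  }
}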
+ */ +package org.apache.hadoop.ozone.client.io; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.hdds.client.BlockID; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.scm.OzoneClientConfig; +import org.apache.hadoop.hdds.scm.XceiverClientFactory; +import org.apache.hadoop.hdds.scm.pipeline.Pipeline; +import org.apache.hadoop.hdds.scm.storage.BlockDataStreamOutput; +import org.apache.hadoop.hdds.scm.storage.ByteBufferStreamOutput; +import org.apache.hadoop.hdds.scm.storage.StreamBuffer; +import org.apache.hadoop.hdds.security.token.OzoneBlockTokenIdentifier; +import org.apache.hadoop.security.token.Token; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.Collections; +import java.util.List; + +/** + * Helper class used inside {@link BlockDataStreamOutput}. + * */ +public final class BlockDataStreamOutputEntry + implements ByteBufferStreamOutput { + + private final OzoneClientConfig config; + private ByteBufferStreamOutput byteBufferStreamOutput; + private BlockID blockID; + private final String key; + private final XceiverClientFactory xceiverClientManager; + private final Pipeline pipeline; + // total number of bytes that should be written to this stream + private final long length; + // the current position of this stream 0 <= currentPosition < length + private long currentPosition; + private final Token token; + private List bufferList; + + @SuppressWarnings({"parameternumber", "squid:S00107"}) + private BlockDataStreamOutputEntry( + BlockID blockID, String key, + XceiverClientFactory xceiverClientManager, + Pipeline pipeline, + long length, + Token token, + OzoneClientConfig config, + List bufferList + ) { + this.config = config; + this.byteBufferStreamOutput = null; + this.blockID = blockID; + this.key = key; + this.xceiverClientManager = xceiverClientManager; + this.pipeline = pipeline; + this.token = token; + this.length = length; + this.currentPosition = 0; + this.bufferList = bufferList; + } + + long getLength() { + return length; + } + + Token getToken() { + return token; + } + + long getRemaining() { + return length - currentPosition; + } + + /** + * BlockDataStreamOutput is initialized in this function. This makes sure that + * xceiverClient initialization is not done during preallocation and only + * done when data is written. 
+ * @throws IOException if xceiverClient initialization fails + */ + private void checkStream() throws IOException { + if (this.byteBufferStreamOutput == null) { + this.byteBufferStreamOutput = + new BlockDataStreamOutput(blockID, xceiverClientManager, pipeline, + config, token, bufferList); + } + } + + @Override + public void write(ByteBuffer b, int off, int len) throws IOException { + checkStream(); + byteBufferStreamOutput.write(b, off, len); + this.currentPosition += len; + } + + @Override + public void flush() throws IOException { + if (this.byteBufferStreamOutput != null) { + this.byteBufferStreamOutput.flush(); + } + } + + @Override + public void close() throws IOException { + if (this.byteBufferStreamOutput != null) { + this.byteBufferStreamOutput.close(); + // after closing the chunkOutPutStream, blockId would have been + // reconstructed with updated bcsId + this.blockID = + ((BlockDataStreamOutput) byteBufferStreamOutput).getBlockID(); + } + } + + boolean isClosed() { + if (byteBufferStreamOutput != null) { + return ((BlockDataStreamOutput) byteBufferStreamOutput).isClosed(); + } + return false; + } + + Collection getFailedServers() { + if (byteBufferStreamOutput != null) { + BlockDataStreamOutput out = + (BlockDataStreamOutput) this.byteBufferStreamOutput; + return out.getFailedServers(); + } + return Collections.emptyList(); + } + + long getWrittenDataLength() { + if (byteBufferStreamOutput != null) { + BlockDataStreamOutput out = + (BlockDataStreamOutput) this.byteBufferStreamOutput; + return out.getWrittenDataLength(); + } else { + // For a pre allocated block for which no write has been initiated, + // the ByteBufferStreamOutput will be null here. + // In such cases, the default blockCommitSequenceId will be 0 + return 0; + } + } + + public long getTotalAckDataLength() { + if (byteBufferStreamOutput != null) { + BlockDataStreamOutput out = + (BlockDataStreamOutput) this.byteBufferStreamOutput; + blockID = out.getBlockID(); + return out.getTotalAckDataLength(); + } else { + // For a pre allocated block for which no write has been initiated, + // the OutputStream will be null here. + // In such cases, the default blockCommitSequenceId will be 0 + return 0; + } + } + + void cleanup(boolean invalidateClient) throws IOException { + checkStream(); + BlockDataStreamOutput out = + (BlockDataStreamOutput) this.byteBufferStreamOutput; + out.cleanup(invalidateClient); + + } + + void writeOnRetry(long len) throws IOException { + checkStream(); + BlockDataStreamOutput out = + (BlockDataStreamOutput) this.byteBufferStreamOutput; + out.writeOnRetry(len); + this.currentPosition += len; + + } + + /** + * Builder class for BlockDataStreamOutputEntry. 
+ * */ + public static class Builder { + + private BlockID blockID; + private String key; + private XceiverClientFactory xceiverClientManager; + private Pipeline pipeline; + private long length; + private Token token; + private OzoneClientConfig config; + private List bufferList; + + public Builder setBlockID(BlockID bID) { + this.blockID = bID; + return this; + } + + public Builder setKey(String keys) { + this.key = keys; + return this; + } + + public Builder setXceiverClientManager( + XceiverClientFactory + xClientManager) { + this.xceiverClientManager = xClientManager; + return this; + } + + public Builder setPipeline(Pipeline ppln) { + this.pipeline = ppln; + return this; + } + + + public Builder setLength(long len) { + this.length = len; + return this; + } + + public Builder setConfig(OzoneClientConfig clientConfig) { + this.config = clientConfig; + return this; + } + + public Builder setToken(Token bToken) { + this.token = bToken; + return this; + } + + public Builder setBufferList(List bList) { + this.bufferList = bList; + return this; + } + + public BlockDataStreamOutputEntry build() { + return new BlockDataStreamOutputEntry(blockID, + key, + xceiverClientManager, + pipeline, + length, + token, config, bufferList); + } + } + + @VisibleForTesting + public ByteBufferStreamOutput getByteBufStreamOutput() { + return byteBufferStreamOutput; + } + + public BlockID getBlockID() { + return blockID; + } + + public String getKey() { + return key; + } + + public XceiverClientFactory getXceiverClientManager() { + return xceiverClientManager; + } + + public Pipeline getPipeline() { + return pipeline; + } + + public long getCurrentPosition() { + return currentPosition; + } + + public void setCurrentPosition(long curPosition) { + this.currentPosition = curPosition; + } +} + + diff --git a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/BlockDataStreamOutputEntryPool.java b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/BlockDataStreamOutputEntryPool.java new file mode 100644 index 000000000000..e51242cc107b --- /dev/null +++ b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/BlockDataStreamOutputEntryPool.java @@ -0,0 +1,290 @@ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
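The Builder above is how a block allocated by OM becomes a writable stream entry; the entry pool in the next file performs exactly this wiring in addKeyLocationInfo. A compact sketch, with every dependency passed in as a placeholder parameter:

import java.util.List;
import org.apache.hadoop.hdds.scm.OzoneClientConfig;
import org.apache.hadoop.hdds.scm.XceiverClientFactory;
import org.apache.hadoop.hdds.scm.storage.StreamBuffer;
import org.apache.hadoop.ozone.client.io.BlockDataStreamOutputEntry;
import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;

final class EntryWiringExample {
  // Mirrors BlockDataStreamOutputEntryPool#addKeyLocationInfo below.
  static BlockDataStreamOutputEntry toEntry(OmKeyLocationInfo block, String keyName,
      XceiverClientFactory factory, OzoneClientConfig config, List<StreamBuffer> buffers) {
    return new BlockDataStreamOutputEntry.Builder()
        .setBlockID(block.getBlockID())
        .setKey(keyName)
        .setXceiverClientManager(factory)
        .setPipeline(block.getPipeline())
        .setLength(block.getLength())
        .setToken(block.getToken())
        .setConfig(config)
        .setBufferList(buffers)
        .build();
  }
}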
+ */ +package org.apache.hadoop.ozone.client.io; + +import com.google.common.base.Preconditions; +import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.hadoop.hdds.scm.OzoneClientConfig; +import org.apache.hadoop.hdds.scm.XceiverClientFactory; +import org.apache.hadoop.hdds.scm.container.common.helpers.ExcludeList; +import org.apache.hadoop.hdds.scm.pipeline.PipelineID; +import org.apache.hadoop.hdds.scm.storage.StreamBuffer; +import org.apache.hadoop.ozone.om.helpers.OmKeyArgs; +import org.apache.hadoop.ozone.om.helpers.OmKeyInfo; +import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; +import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfoGroup; +import org.apache.hadoop.ozone.om.helpers.OmMultipartCommitUploadPartInfo; +import org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.ListIterator; + +/** + * This class manages the stream entries list and handles block allocation + * from OzoneManager. + */ +public class BlockDataStreamOutputEntryPool { + + public static final Logger LOG = + LoggerFactory.getLogger(BlockDataStreamOutputEntryPool.class); + + private final List streamEntries; + private final OzoneClientConfig config; + private int currentStreamIndex; + private final OzoneManagerProtocol omClient; + private final OmKeyArgs keyArgs; + private final XceiverClientFactory xceiverClientFactory; + private final String requestID; + private OmMultipartCommitUploadPartInfo commitUploadPartInfo; + private final long openID; + private final ExcludeList excludeList; + private List bufferList; + + @SuppressWarnings({"parameternumber", "squid:S00107"}) + public BlockDataStreamOutputEntryPool( + OzoneClientConfig config, + OzoneManagerProtocol omClient, + String requestId, ReplicationConfig replicationConfig, + String uploadID, int partNumber, + boolean isMultipart, OmKeyInfo info, + boolean unsafeByteBufferConversion, + XceiverClientFactory xceiverClientFactory, long openID + ) { + this.config = config; + this.xceiverClientFactory = xceiverClientFactory; + streamEntries = new ArrayList<>(); + currentStreamIndex = 0; + this.omClient = omClient; + this.keyArgs = new OmKeyArgs.Builder().setVolumeName(info.getVolumeName()) + .setBucketName(info.getBucketName()).setKeyName(info.getKeyName()) + .setReplicationConfig(replicationConfig).setDataSize(info.getDataSize()) + .setIsMultipartKey(isMultipart).setMultipartUploadID(uploadID) + .setMultipartUploadPartNumber(partNumber).build(); + this.requestID = requestId; + this.openID = openID; + this.excludeList = new ExcludeList(); + this.bufferList = new ArrayList<>(); + } + + /** + * When a key is opened, it is possible that there are some blocks already + * allocated to it for this open session. In this case, to make use of these + * blocks, we need to add these blocks to stream entries. But, a key's version + * also includes blocks from previous versions, we need to avoid adding these + * old blocks to stream entries, because these old blocks should not be picked + * for write. To do this, the following method checks that, only those + * blocks created in this particular open version are added to stream entries. + * + * @param version the set of blocks that are pre-allocated. + * @param openVersion the version corresponding to the pre-allocation. 
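A hypothetical call site for the addPreallocateBlocks method documented above; the OpenKeySession accessors used here match the ones RpcClient calls later in this patch.

import java.io.IOException;
import org.apache.hadoop.ozone.client.io.BlockDataStreamOutputEntryPool;
import org.apache.hadoop.ozone.om.helpers.OpenKeySession;

final class PreallocatedBlocksExample {
  // Registers only the blocks allocated in this open session with the pool.
  static void register(BlockDataStreamOutputEntryPool pool, OpenKeySession openKey)
      throws IOException {
    pool.addPreallocateBlocks(
        openKey.getKeyInfo().getLatestVersionLocations(),
        openKey.getOpenVersion());
  }
}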
+ * @throws IOException + */ + public void addPreallocateBlocks(OmKeyLocationInfoGroup version, + long openVersion) throws IOException { + // server may return any number of blocks, (0 to any) + // only the blocks allocated in this open session (block createVersion + // equals to open session version) + for (OmKeyLocationInfo subKeyInfo : version.getLocationList(openVersion)) { + addKeyLocationInfo(subKeyInfo); + } + } + + private void addKeyLocationInfo(OmKeyLocationInfo subKeyInfo) { + Preconditions.checkNotNull(subKeyInfo.getPipeline()); + BlockDataStreamOutputEntry.Builder builder = + new BlockDataStreamOutputEntry.Builder() + .setBlockID(subKeyInfo.getBlockID()) + .setKey(keyArgs.getKeyName()) + .setXceiverClientManager(xceiverClientFactory) + .setPipeline(subKeyInfo.getPipeline()) + .setConfig(config) + .setLength(subKeyInfo.getLength()) + .setToken(subKeyInfo.getToken()) + .setBufferList(bufferList); + streamEntries.add(builder.build()); + } + + public List getLocationInfoList() { + List locationInfoList = new ArrayList<>(); + for (BlockDataStreamOutputEntry streamEntry : streamEntries) { + long length = streamEntry.getCurrentPosition(); + + // Commit only those blocks to OzoneManager which are not empty + if (length != 0) { + OmKeyLocationInfo info = + new OmKeyLocationInfo.Builder().setBlockID(streamEntry.getBlockID()) + .setLength(streamEntry.getCurrentPosition()).setOffset(0) + .setToken(streamEntry.getToken()) + .setPipeline(streamEntry.getPipeline()).build(); + locationInfoList.add(info); + } + if (LOG.isDebugEnabled()) { + LOG.debug( + "block written " + streamEntry.getBlockID() + ", length " + length + + " bcsID " + streamEntry.getBlockID() + .getBlockCommitSequenceId()); + } + } + return locationInfoList; + } + + /** + * Discards the subsequent pre allocated blocks and removes the streamEntries + * from the streamEntries list for the container which is closed. + * @param containerID id of the closed container + * @param pipelineId id of the associated pipeline + */ + void discardPreallocatedBlocks(long containerID, PipelineID pipelineId) { + // currentStreamIndex < streamEntries.size() signifies that, there are still + // pre allocated blocks available. + + // This will be called only to discard the next subsequent unused blocks + // in the streamEntryList. + if (currentStreamIndex + 1 < streamEntries.size()) { + ListIterator streamEntryIterator = + streamEntries.listIterator(currentStreamIndex + 1); + while (streamEntryIterator.hasNext()) { + BlockDataStreamOutputEntry streamEntry = streamEntryIterator.next(); + Preconditions.checkArgument(streamEntry.getCurrentPosition() == 0); + if ((streamEntry.getPipeline().getId().equals(pipelineId)) || + (containerID != -1 && + streamEntry.getBlockID().getContainerID() == containerID)) { + streamEntryIterator.remove(); + } + } + } + } + + List getStreamEntries() { + return streamEntries; + } + + XceiverClientFactory getXceiverClientFactory() { + return xceiverClientFactory; + } + + String getKeyName() { + return keyArgs.getKeyName(); + } + + long getKeyLength() { + return streamEntries.stream().mapToLong( + BlockDataStreamOutputEntry::getCurrentPosition).sum(); + } + /** + * Contact OM to get a new block. Set the new block with the index (e.g. + * first block has index = 0, second has index = 1 etc.) + * + * The returned block is made to new BlockDataStreamOutputEntry to write. 
+ * + * @throws IOException + */ + private void allocateNewBlock() throws IOException { + if (!excludeList.isEmpty()) { + LOG.debug("Allocating block with {}", excludeList); + } + OmKeyLocationInfo subKeyInfo = + omClient.allocateBlock(keyArgs, openID, excludeList); + addKeyLocationInfo(subKeyInfo); + } + + + void commitKey(long offset) throws IOException { + if (keyArgs != null) { + // in test, this could be null + long length = getKeyLength(); + Preconditions.checkArgument(offset == length); + keyArgs.setDataSize(length); + keyArgs.setLocationInfoList(getLocationInfoList()); + // When the key is multipart upload part file upload, we should not + // commit the key, as this is not an actual key, this is a just a + // partial key of a large file. + if (keyArgs.getIsMultipartKey()) { + commitUploadPartInfo = + omClient.commitMultipartUploadPart(keyArgs, openID); + } else { + omClient.commitKey(keyArgs, openID); + } + } else { + LOG.warn("Closing KeyDataStreamOutput, but key args is null"); + } + } + + public BlockDataStreamOutputEntry getCurrentStreamEntry() { + if (streamEntries.isEmpty() || streamEntries.size() <= currentStreamIndex) { + return null; + } else { + return streamEntries.get(currentStreamIndex); + } + } + + BlockDataStreamOutputEntry allocateBlockIfNeeded() throws IOException { + BlockDataStreamOutputEntry streamEntry = getCurrentStreamEntry(); + if (streamEntry != null && streamEntry.isClosed()) { + // a stream entry gets closed either by : + // a. If the stream gets full + // b. it has encountered an exception + currentStreamIndex++; + } + if (streamEntries.size() <= currentStreamIndex) { + Preconditions.checkNotNull(omClient); + // allocate a new block, if a exception happens, log an error and + // throw exception to the caller directly, and the write fails. + allocateNewBlock(); + } + // in theory, this condition should never violate due the check above + // still do a sanity check. + Preconditions.checkArgument(currentStreamIndex < streamEntries.size()); + return streamEntries.get(currentStreamIndex); + } + + void cleanup() { + if (excludeList != null) { + excludeList.clear(); + } + + if (streamEntries != null) { + streamEntries.clear(); + } + } + + public OmMultipartCommitUploadPartInfo getCommitUploadPartInfo() { + return commitUploadPartInfo; + } + + public ExcludeList getExcludeList() { + return excludeList; + } + + boolean isEmpty() { + return streamEntries.isEmpty(); + } + + long computeBufferData() { + long totalDataLen = 0; + for (StreamBuffer b : bufferList) { + totalDataLen += b.position(); + } + return totalDataLen; + } +} diff --git a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/KeyDataStreamOutput.java b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/KeyDataStreamOutput.java new file mode 100644 index 000000000000..dc5c3a016d70 --- /dev/null +++ b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/KeyDataStreamOutput.java @@ -0,0 +1,510 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ozone.client.io; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import org.apache.hadoop.fs.FSExceptionMessages; +import org.apache.hadoop.fs.FileEncryptionInfo; +import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.scm.OzoneClientConfig; +import org.apache.hadoop.hdds.scm.XceiverClientFactory; +import org.apache.hadoop.hdds.scm.client.HddsClientUtils; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.common.helpers.ExcludeList; +import org.apache.hadoop.hdds.scm.pipeline.Pipeline; +import org.apache.hadoop.hdds.scm.pipeline.PipelineID; +import org.apache.hadoop.hdds.scm.storage.AbstractDataStreamOutput; +import org.apache.hadoop.ozone.om.helpers.OmKeyInfo; +import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; +import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfoGroup; +import org.apache.hadoop.ozone.om.helpers.OmMultipartCommitUploadPartInfo; +import org.apache.hadoop.ozone.om.helpers.OpenKeySession; +import org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.List; + +/** + * Maintaining a list of BlockInputStream. Write based on offset. + * + * Note that this may write to multiple containers in one write call. In case + * that first container succeeded but later ones failed, the succeeded writes + * are not rolled back. + * + * TODO : currently not support multi-thread access. + */ +public class KeyDataStreamOutput extends AbstractDataStreamOutput { + + private OzoneClientConfig config; + + /** + * Defines stream action while calling handleFlushOrClose. 
+ */ + enum StreamAction { + FLUSH, CLOSE, FULL + } + + public static final Logger LOG = + LoggerFactory.getLogger(KeyDataStreamOutput.class); + + private boolean closed; + private FileEncryptionInfo feInfo; + + // how much of data is actually written yet to underlying stream + private long offset; + // how much data has been ingested into the stream + private long writeOffset; + + private final BlockDataStreamOutputEntryPool blockDataStreamOutputEntryPool; + + private long clientID; + + @VisibleForTesting + public List getStreamEntries() { + return blockDataStreamOutputEntryPool.getStreamEntries(); + } + + @VisibleForTesting + public XceiverClientFactory getXceiverClientFactory() { + return blockDataStreamOutputEntryPool.getXceiverClientFactory(); + } + + @VisibleForTesting + public List getLocationInfoList() { + return blockDataStreamOutputEntryPool.getLocationInfoList(); + } + + @VisibleForTesting + public long getClientID() { + return clientID; + } + + @SuppressWarnings({"parameternumber", "squid:S00107"}) + public KeyDataStreamOutput( + OzoneClientConfig config, + OpenKeySession handler, + XceiverClientFactory xceiverClientManager, + OzoneManagerProtocol omClient, int chunkSize, + String requestId, ReplicationConfig replicationConfig, + String uploadID, int partNumber, boolean isMultipart, + boolean unsafeByteBufferConversion + ) { + super(HddsClientUtils.getRetryPolicyByException( + config.getMaxRetryCount(), config.getRetryInterval())); + this.config = config; + OmKeyInfo info = handler.getKeyInfo(); + blockDataStreamOutputEntryPool = + new BlockDataStreamOutputEntryPool( + config, + omClient, + requestId, replicationConfig, + uploadID, partNumber, + isMultipart, info, + unsafeByteBufferConversion, + xceiverClientManager, + handler.getId()); + + // Retrieve the file encryption key info, null if file is not in + // encrypted bucket. + this.feInfo = info.getFileEncryptionInfo(); + this.writeOffset = 0; + this.clientID = handler.getId(); + } + + /** + * When a key is opened, it is possible that there are some blocks already + * allocated to it for this open session. In this case, to make use of these + * blocks, we need to add these blocks to stream entries. But, a key's version + * also includes blocks from previous versions, we need to avoid adding these + * old blocks to stream entries, because these old blocks should not be picked + * for write. To do this, the following method checks that, only those + * blocks created in this particular open version are added to stream entries. + * + * @param version the set of blocks that are pre-allocated. + * @param openVersion the version corresponding to the pre-allocation. + * @throws IOException + */ + public void addPreallocateBlocks(OmKeyLocationInfoGroup version, + long openVersion) throws IOException { + blockDataStreamOutputEntryPool.addPreallocateBlocks(version, openVersion); + } + + @Override + public void write(ByteBuffer b, int off, int len) throws IOException { + checkNotClosed(); + if (b == null) { + throw new NullPointerException(); + } + handleWrite(b, off, len, false); + writeOffset += len; + } + + private void handleWrite(ByteBuffer b, int off, long len, boolean retry) + throws IOException { + while (len > 0) { + try { + BlockDataStreamOutputEntry current = + blockDataStreamOutputEntryPool.allocateBlockIfNeeded(); + // length(len) will be in int range if the call is happening through + // write API of blockDataStreamOutput. Length can be in long range + // if it comes via Exception path. 
+ int expectedWriteLen = Math.min((int) len, + (int) current.getRemaining()); + long currentPos = current.getWrittenDataLength(); + // writeLen will be updated based on whether the write was succeeded + // or if it sees an exception, how much the actual write was + // acknowledged. + int writtenLength = + writeToDataStreamOutput(current, retry, len, b, + expectedWriteLen, off, currentPos); + if (current.getRemaining() <= 0) { + // since the current block is already written close the stream. + handleFlushOrClose(StreamAction.FULL); + } + len -= writtenLength; + off += writtenLength; + } catch (Exception e) { + markStreamClosed(); + throw new IOException(e); + } + } + } + + private int writeToDataStreamOutput(BlockDataStreamOutputEntry current, + boolean retry, long len, ByteBuffer b, int writeLen, int off, + long currentPos) throws IOException { + try { + if (retry) { + current.writeOnRetry(len); + } else { + current.write(b, off, writeLen); + offset += writeLen; + } + } catch (IOException ioe) { + // for the current iteration, totalDataWritten - currentPos gives the + // amount of data already written to the buffer + + // In the retryPath, the total data to be written will always be equal + // to or less than the max length of the buffer allocated. + // The len specified here is the combined sum of the data length of + // the buffers + Preconditions.checkState(!retry || len <= config + .getStreamBufferMaxSize()); + int dataWritten = (int) (current.getWrittenDataLength() - currentPos); + writeLen = retry ? (int) len : dataWritten; + // In retry path, the data written is already accounted in offset. + if (!retry) { + offset += writeLen; + } + LOG.debug("writeLen {}, total len {}", writeLen, len); + handleException(current, ioe); + } + return writeLen; + } + + /** + * It performs following actions : + * a. Updates the committed length at datanode for the current stream in + * datanode. + * b. Reads the data from the underlying buffer and writes it the next stream. 
+ * + * @param streamEntry StreamEntry + * @param exception actual exception that occurred + * @throws IOException Throws IOException if Write fails + */ + private void handleException(BlockDataStreamOutputEntry streamEntry, + IOException exception) throws IOException { + Throwable t = HddsClientUtils.checkForException(exception); + Preconditions.checkNotNull(t); + boolean retryFailure = checkForRetryFailure(t); + boolean containerExclusionException = false; + if (!retryFailure) { + containerExclusionException = checkIfContainerToExclude(t); + } + Pipeline pipeline = streamEntry.getPipeline(); + PipelineID pipelineId = pipeline.getId(); + long totalSuccessfulFlushedData = streamEntry.getTotalAckDataLength(); + //set the correct length for the current stream + streamEntry.setCurrentPosition(totalSuccessfulFlushedData); + long containerId = streamEntry.getBlockID().getContainerID(); + Collection failedServers = streamEntry.getFailedServers(); + Preconditions.checkNotNull(failedServers); + ExcludeList excludeList = blockDataStreamOutputEntryPool.getExcludeList(); + long bufferedDataLen = blockDataStreamOutputEntryPool.computeBufferData(); + if (!failedServers.isEmpty()) { + excludeList.addDatanodes(failedServers); + } + + // if the container needs to be excluded , add the container to the + // exclusion list , otherwise add the pipeline to the exclusion list + if (containerExclusionException) { + excludeList.addConatinerId(ContainerID.valueOf(containerId)); + } else { + excludeList.addPipeline(pipelineId); + } + // just clean up the current stream. + streamEntry.cleanup(retryFailure); + + // discard all subsequent blocks the containers and pipelines which + // are in the exclude list so that, the very next retry should never + // write data on the closed container/pipeline + if (containerExclusionException) { + // discard subsequent pre allocated blocks from the streamEntries list + // from the closed container + blockDataStreamOutputEntryPool + .discardPreallocatedBlocks(streamEntry.getBlockID().getContainerID(), + null); + } else { + // In case there is timeoutException or Watch for commit happening over + // majority or the client connection failure to the leader in the + // pipeline, just discard all the pre allocated blocks on this pipeline. + // Next block allocation will happen with excluding this specific pipeline + // This will ensure if 2 way commit happens , it cannot span over multiple + // blocks + blockDataStreamOutputEntryPool + .discardPreallocatedBlocks(-1, pipelineId); + } + if (bufferedDataLen > 0) { + // If the data is still cached in the underlying stream, we need to + // allocate new block and write this data in the datanode. + handleRetry(exception); + handleWrite(null, 0, bufferedDataLen, true); + // reset the retryCount after handling the exception + resetRetryCount(); + } + } + + private void markStreamClosed() { + blockDataStreamOutputEntryPool.cleanup(); + closed = true; + } + + @Override + public void flush() throws IOException { + checkNotClosed(); + handleFlushOrClose(StreamAction.FLUSH); + } + + /** + * Close or Flush the latest outputStream depending upon the action. + * This function gets called when while write is going on, the current stream + * gets full or explicit flush or close request is made by client. when the + * stream gets full and we try to close the stream , we might end up hitting + * an exception in the exception handling path, we write the data residing in + * in the buffer pool to a new Block. 
In cases, as such, when the data gets + * written to new stream , it will be at max half full. In such cases, we + * should just write the data and not close the stream as the block won't be + * completely full. + * + * @param op Flag which decides whether to call close or flush on the + * outputStream. + * @throws IOException In case, flush or close fails with exception. + */ + @SuppressWarnings("squid:S1141") + private void handleFlushOrClose(StreamAction op) throws IOException { + if (!blockDataStreamOutputEntryPool.isEmpty()) { + while (true) { + try { + BlockDataStreamOutputEntry entry = + blockDataStreamOutputEntryPool.getCurrentStreamEntry(); + if (entry != null) { + try { + handleStreamAction(entry, op); + } catch (IOException ioe) { + handleException(entry, ioe); + continue; + } + } + return; + } catch (Exception e) { + markStreamClosed(); + throw e; + } + } + } + } + + private void handleStreamAction(BlockDataStreamOutputEntry entry, + StreamAction op) throws IOException { + Collection failedServers = entry.getFailedServers(); + // failed servers can be null in case there is no data written in + // the stream + if (!failedServers.isEmpty()) { + blockDataStreamOutputEntryPool.getExcludeList().addDatanodes( + failedServers); + } + switch (op) { + case CLOSE: + entry.close(); + break; + case FULL: + if (entry.getRemaining() == 0) { + entry.close(); + } + break; + case FLUSH: + entry.flush(); + break; + default: + throw new IOException("Invalid Operation"); + } + } + + /** + * Commit the key to OM, this will add the blocks as the new key blocks. + * + * @throws IOException + */ + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + try { + handleFlushOrClose(StreamAction.CLOSE); + if (!isException()) { + Preconditions.checkArgument(writeOffset == offset); + } + blockDataStreamOutputEntryPool.commitKey(offset); + } finally { + blockDataStreamOutputEntryPool.cleanup(); + } + } + + public OmMultipartCommitUploadPartInfo getCommitUploadPartInfo() { + return blockDataStreamOutputEntryPool.getCommitUploadPartInfo(); + } + + public FileEncryptionInfo getFileEncryptionInfo() { + return feInfo; + } + + @VisibleForTesting + public ExcludeList getExcludeList() { + return blockDataStreamOutputEntryPool.getExcludeList(); + } + + /** + * Builder class of KeyDataStreamOutput. 
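A hedged sketch of how this Builder is driven, mirroring RpcClient#createDataStreamOutput and createMultipartStreamKey later in this patch; the open key session, client objects, and configs are assumed to be supplied by the caller.

import java.io.IOException;
import java.util.UUID;
import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.scm.OzoneClientConfig;
import org.apache.hadoop.hdds.scm.XceiverClientFactory;
import org.apache.hadoop.ozone.client.io.KeyDataStreamOutput;
import org.apache.hadoop.ozone.om.helpers.OpenKeySession;
import org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol;

final class KeyDataStreamOutputExample {
  // Builds a streaming key writer and seeds it with the blocks preallocated at open time.
  static KeyDataStreamOutput open(OpenKeySession openKey, XceiverClientFactory factory,
      OzoneManagerProtocol omClient, ReplicationConfig replication, OzoneClientConfig config)
      throws IOException {
    KeyDataStreamOutput out = new KeyDataStreamOutput.Builder()
        .setHandler(openKey)
        .setXceiverClientManager(factory)
        .setOmClient(omClient)
        .setRequestID(UUID.randomUUID().toString())
        .setReplicationConfig(replication)
        .setConfig(config)
        .build();
    out.addPreallocateBlocks(
        openKey.getKeyInfo().getLatestVersionLocations(), openKey.getOpenVersion());
    return out;
  }
}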
+ */ + public static class Builder { + private OpenKeySession openHandler; + private XceiverClientFactory xceiverManager; + private OzoneManagerProtocol omClient; + private int chunkSize; + private String requestID; + private String multipartUploadID; + private int multipartNumber; + private boolean isMultipartKey; + private boolean unsafeByteBufferConversion; + private OzoneClientConfig clientConfig; + private ReplicationConfig replicationConfig; + + public Builder setMultipartUploadID(String uploadID) { + this.multipartUploadID = uploadID; + return this; + } + + public Builder setMultipartNumber(int partNumber) { + this.multipartNumber = partNumber; + return this; + } + + public Builder setHandler(OpenKeySession handler) { + this.openHandler = handler; + return this; + } + + public Builder setXceiverClientManager(XceiverClientFactory manager) { + this.xceiverManager = manager; + return this; + } + + public Builder setOmClient(OzoneManagerProtocol client) { + this.omClient = client; + return this; + } + + public Builder setChunkSize(int size) { + this.chunkSize = size; + return this; + } + + public Builder setRequestID(String id) { + this.requestID = id; + return this; + } + + public Builder setIsMultipartKey(boolean isMultipart) { + this.isMultipartKey = isMultipart; + return this; + } + + public Builder setConfig(OzoneClientConfig config) { + this.clientConfig = config; + return this; + } + + public Builder enableUnsafeByteBufferConversion(boolean enabled) { + this.unsafeByteBufferConversion = enabled; + return this; + } + + + public Builder setReplicationConfig(ReplicationConfig replConfig) { + this.replicationConfig = replConfig; + return this; + } + + public KeyDataStreamOutput build() { + return new KeyDataStreamOutput( + clientConfig, + openHandler, + xceiverManager, + omClient, + chunkSize, + requestID, + replicationConfig, + multipartUploadID, + multipartNumber, + isMultipartKey, + unsafeByteBufferConversion); + } + + } + + /** + * Verify that the output stream is open. Non blocking; this gives + * the last state of the volatile {@link #closed} field. + * @throws IOException if the connection is closed. + */ + private void checkNotClosed() throws IOException { + if (closed) { + throw new IOException( + ": " + FSExceptionMessages.STREAM_IS_CLOSED + " Key: " + + blockDataStreamOutputEntryPool.getKeyName()); + } + } +} diff --git a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/OzoneDataStreamOutput.java b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/OzoneDataStreamOutput.java new file mode 100644 index 000000000000..d40ac2b332ef --- /dev/null +++ b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/OzoneDataStreamOutput.java @@ -0,0 +1,70 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.ozone.client.io; + +import org.apache.hadoop.hdds.scm.storage.ByteBufferStreamOutput; +import org.apache.hadoop.ozone.om.helpers.OmMultipartCommitUploadPartInfo; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * OzoneDataStreamOutput is used to write data into Ozone. + * It uses SCM's {@link KeyDataStreamOutput} for writing the data. + */ +public class OzoneDataStreamOutput implements ByteBufferStreamOutput { + + private final ByteBufferStreamOutput byteBufferStreamOutput; + + /** + * Constructs OzoneDataStreamOutput with KeyDataStreamOutput. + * + * @param byteBufferStreamOutput the underlying ByteBufferStreamOutput + */ + public OzoneDataStreamOutput(ByteBufferStreamOutput byteBufferStreamOutput) { + this.byteBufferStreamOutput = byteBufferStreamOutput; + } + + @Override + public void write(ByteBuffer b, int off, int len) throws IOException { + byteBufferStreamOutput.write(b, off, len); + } + + @Override + public synchronized void flush() throws IOException { + byteBufferStreamOutput.flush(); + } + + @Override + public synchronized void close() throws IOException { + //commitKey can be done here, if needed. + byteBufferStreamOutput.close(); + } + + public OmMultipartCommitUploadPartInfo getCommitUploadPartInfo() { + if (byteBufferStreamOutput instanceof KeyDataStreamOutput) { + return ((KeyDataStreamOutput) + byteBufferStreamOutput).getCommitUploadPartInfo(); + } + // Otherwise return null. + return null; + } + + public ByteBufferStreamOutput getByteBufStreamOutput() { + return byteBufferStreamOutput; + } +} diff --git a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/protocol/ClientProtocol.java b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/protocol/ClientProtocol.java index 70a04406a0ac..290c8db5ec82 100644 --- a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/protocol/ClientProtocol.java +++ b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/protocol/ClientProtocol.java @@ -40,6 +40,7 @@ import org.apache.hadoop.ozone.client.OzoneVolume; import org.apache.hadoop.ozone.client.TenantArgs; import org.apache.hadoop.ozone.client.VolumeArgs; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.client.io.OzoneInputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.om.OMConfigKeys; @@ -310,6 +311,20 @@ OzoneOutputStream createKey(String volumeName, String bucketName, Map metadata) throws IOException; + /** + * Writes a key in an existing bucket. + * @param volumeName Name of the Volume + * @param bucketName Name of the Bucket + * @param keyName Name of the Key + * @param size Size of the data + * @param metadata custom key value metadata + * @return {@link OzoneDataStreamOutput} + * + */ + OzoneDataStreamOutput createStreamKey(String volumeName, String bucketName, + String keyName, long size, ReplicationConfig replicationConfig, + Map metadata) + throws IOException; /** * Reads a key from an existing bucket. 
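Putting the client-side pieces together, a minimal hypothetical end-to-end write through the new streaming path could look like the sketch below. The key name and replication object are placeholders supplied by the caller; createStreamKey, write(ByteBuffer), and close() are the APIs introduced or exercised in this patch.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.ozone.client.OzoneBucket;
import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput;

final class StreamKeyWriteExample {
  // Writes a small payload as a new key using the streaming output.
  static void write(OzoneBucket bucket, ReplicationConfig replication) throws Exception {
    byte[] payload = "hello streaming write".getBytes(StandardCharsets.UTF_8);
    OzoneDataStreamOutput out = bucket.createStreamKey(
        "key1", payload.length, replication, new HashMap<>());
    out.write(ByteBuffer.wrap(payload));
    out.close(); // KeyDataStreamOutput#close commits the key to OM
  }
}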
@@ -479,6 +494,24 @@ OzoneOutputStream createMultipartKey(String volumeName, String bucketName, int partNumber, String uploadID) throws IOException; + /** + * Create a part key for a multipart upload key. + * @param volumeName + * @param bucketName + * @param keyName + * @param size + * @param partNumber + * @param uploadID + * @return OzoneDataStreamOutput + * @throws IOException + */ + OzoneDataStreamOutput createMultipartStreamKey(String volumeName, + String bucketName, + String keyName, long size, + int partNumber, + String uploadID) + throws IOException; + /** * Complete Multipart upload. This will combine all the parts and make the * key visible in ozone. @@ -796,6 +829,11 @@ OzoneOutputStream createFile(String volumeName, String bucketName, String keyName, long size, ReplicationConfig replicationConfig, boolean overWrite, boolean recursive) throws IOException; + @SuppressWarnings("checkstyle:parameternumber") + OzoneDataStreamOutput createStreamFile(String volumeName, String bucketName, + String keyName, long size, ReplicationConfig replicationConfig, + boolean overWrite, boolean recursive) throws IOException; + /** * List the status for a file or a directory and its contents. diff --git a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/rpc/RpcClient.java b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/rpc/RpcClient.java index 016eff52d38b..c0ac09a51606 100644 --- a/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/rpc/RpcClient.java +++ b/hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/rpc/RpcClient.java @@ -91,11 +91,13 @@ import org.apache.hadoop.ozone.client.io.BlockInputStreamFactory; import org.apache.hadoop.ozone.client.io.BlockInputStreamFactoryImpl; import org.apache.hadoop.ozone.client.io.ECKeyOutputStream; +import org.apache.hadoop.ozone.client.io.KeyDataStreamOutput; import org.apache.hadoop.ozone.client.io.KeyInputStream; import org.apache.hadoop.ozone.client.io.KeyOutputStream; import org.apache.hadoop.ozone.client.io.LengthInputStream; import org.apache.hadoop.ozone.client.io.MultipartCryptoKeyInputStream; import org.apache.hadoop.ozone.client.io.OzoneCryptoInputStream; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.client.io.OzoneInputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.client.protocol.ClientProtocol; @@ -1183,6 +1185,48 @@ public OzoneOutputStream createKey( return createOutputStream(openKey, requestId); } + @Override + public OzoneDataStreamOutput createStreamKey( + String volumeName, String bucketName, String keyName, long size, + ReplicationConfig replicationConfig, + Map metadata) + throws IOException { + verifyVolumeName(volumeName); + verifyBucketName(bucketName); + if (checkKeyNameEnabled) { + HddsClientUtils.verifyKeyName(keyName); + } + HddsClientUtils.checkNotNull(keyName, replicationConfig); + String requestId = UUID.randomUUID().toString(); + + OmKeyArgs.Builder builder = new OmKeyArgs.Builder() + .setVolumeName(volumeName) + .setBucketName(bucketName) + .setKeyName(keyName) + .setDataSize(size) + .setReplicationConfig(replicationConfig) + .addAllMetadata(metadata) + .setAcls(getAclList()); + + if (Boolean.parseBoolean(metadata.get(OzoneConsts.GDPR_FLAG))) { + try { + GDPRSymmetricKey gKey = new GDPRSymmetricKey(new SecureRandom()); + builder.addAllMetadata(gKey.getKeyDetails()); + } catch (Exception e) { + if (e instanceof InvalidKeyException && + 
e.getMessage().contains("Illegal key size or default parameters")) { + LOG.error("Missing Unlimited Strength Policy jars. Please install " + + "Java Cryptography Extension (JCE) Unlimited Strength " + + "Jurisdiction Policy Files"); + } + throw new IOException(e); + } + } + + OpenKeySession openKey = ozoneManagerClient.openKey(builder.build()); + return createDataStreamOutput(openKey, requestId, replicationConfig); + } + private KeyProvider.KeyVersion getDEK(FileEncryptionInfo feInfo) throws IOException { // check crypto protocol version @@ -1524,6 +1568,70 @@ public OzoneOutputStream createMultipartKey(String volumeName, } } + @Override + public OzoneDataStreamOutput createMultipartStreamKey( + String volumeName, + String bucketName, + String keyName, + long size, + int partNumber, + String uploadID) + throws IOException { + verifyVolumeName(volumeName); + verifyBucketName(bucketName); + if (checkKeyNameEnabled) { + HddsClientUtils.verifyKeyName(keyName); + } + HddsClientUtils.checkNotNull(keyName, uploadID); + Preconditions.checkArgument(partNumber > 0 && partNumber <= 10000, "Part " + + "number should be greater than zero and less than or equal to 10000"); + Preconditions.checkArgument(size >= 0, "size should be greater than or " + + "equal to zero"); + String requestId = UUID.randomUUID().toString(); + + OmKeyArgs keyArgs = new OmKeyArgs.Builder() + .setVolumeName(volumeName) + .setBucketName(bucketName) + .setKeyName(keyName) + .setDataSize(size) + .setIsMultipartKey(true) + .setMultipartUploadID(uploadID) + .setMultipartUploadPartNumber(partNumber) + .setAcls(getAclList()) + .build(); + + OpenKeySession openKey = ozoneManagerClient.openKey(keyArgs); + + KeyDataStreamOutput keyOutputStream = + new KeyDataStreamOutput.Builder() + .setHandler(openKey) + .setXceiverClientManager(xceiverClientManager) + .setOmClient(ozoneManagerClient) + .setRequestID(requestId) + .setReplicationConfig(openKey.getKeyInfo().getReplicationConfig()) + .setMultipartNumber(partNumber) + .setMultipartUploadID(uploadID) + .setIsMultipartKey(true) + .enableUnsafeByteBufferConversion(unsafeByteBufferConversion) + .setConfig(clientConfig) + .build(); + keyOutputStream + .addPreallocateBlocks( + openKey.getKeyInfo().getLatestVersionLocations(), + openKey.getOpenVersion()); + + FileEncryptionInfo feInfo = openKey.getKeyInfo().getFileEncryptionInfo(); + if (feInfo != null) { + // todo: need to support file encrypt, + // https://issues.apache.org/jira/browse/HDDS-5892 + throw new UnsupportedOperationException( + "FileEncryptionInfo is not yet supported in " + + "createMultipartStreamKey"); + } else { + return new OzoneDataStreamOutput(keyOutputStream); + } + } + @Override public OmMultipartUploadCompleteInfo completeMultipartUpload( String volumeName, String bucketName, String keyName, String uploadID, @@ -1734,6 +1842,25 @@ private OmKeyArgs prepareOmKeyArgs(String volumeName, String bucketName, .build(); } + @Override + public OzoneDataStreamOutput createStreamFile(String volumeName, + String bucketName, String keyName, long size, + ReplicationConfig replicationConfig, boolean overWrite, boolean recursive) + throws IOException { + OmKeyArgs keyArgs = new OmKeyArgs.Builder() + .setVolumeName(volumeName) + .setBucketName(bucketName) + .setKeyName(keyName) + .setDataSize(size) + .setReplicationConfig(replicationConfig) + .setAcls(getAclList()) + .setLatestVersionLocation(getLatestVersionLocation) + .build(); + OpenKeySession keySession = + ozoneManagerClient.createFile(keyArgs, overWrite, recursive); + return 
createDataStreamOutput(keySession, UUID.randomUUID().toString(), + replicationConfig); + } @Override public List listStatus(String volumeName, String bucketName, @@ -1863,6 +1990,24 @@ private OzoneInputStream createInputStream( cryptoInputStreams); } } + private OzoneDataStreamOutput createDataStreamOutput(OpenKeySession openKey, + String requestId, ReplicationConfig replicationConfig) + throws IOException { + KeyDataStreamOutput keyOutputStream = + new KeyDataStreamOutput.Builder() + .setHandler(openKey) + .setXceiverClientManager(xceiverClientManager) + .setOmClient(ozoneManagerClient) + .setRequestID(requestId) + .setReplicationConfig(replicationConfig) + .enableUnsafeByteBufferConversion(unsafeByteBufferConversion) + .setConfig(clientConfig) + .build(); + keyOutputStream + .addPreallocateBlocks(openKey.getKeyInfo().getLatestVersionLocations(), + openKey.getOpenVersion()); + return new OzoneDataStreamOutput(keyOutputStream); + } private OzoneOutputStream createOutputStream(OpenKeySession openKey, String requestId) throws IOException { diff --git a/hadoop-ozone/dev-support/intellij/runConfigurations/Datanode2.xml b/hadoop-ozone/dev-support/intellij/runConfigurations/Datanode2.xml index 3d3302030d18..040b515b9fac 100644 --- a/hadoop-ozone/dev-support/intellij/runConfigurations/Datanode2.xml +++ b/hadoop-ozone/dev-support/intellij/runConfigurations/Datanode2.xml @@ -18,7 +18,7 @@

+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.hadoop.ozone.client.rpc; + +import org.apache.hadoop.conf.StorageUnit; +import org.apache.hadoop.hdds.client.ReplicationType; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; +import org.apache.hadoop.hdds.scm.OzoneClientConfig; +import org.apache.hadoop.hdds.scm.XceiverClientManager; +import org.apache.hadoop.hdds.scm.XceiverClientMetrics; +import org.apache.hadoop.hdds.scm.storage.BlockDataStreamOutput; +import org.apache.hadoop.hdds.scm.storage.ByteBufferStreamOutput; +import org.apache.hadoop.ozone.MiniOzoneCluster; +import org.apache.hadoop.ozone.OzoneConfigKeys; +import org.apache.hadoop.ozone.client.ObjectStore; +import org.apache.hadoop.ozone.client.OzoneClient; +import org.apache.hadoop.ozone.client.OzoneClientFactory; +import org.apache.hadoop.ozone.client.io.BlockDataStreamOutputEntry; +import org.apache.hadoop.ozone.client.io.KeyDataStreamOutput; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; +import org.apache.hadoop.ozone.container.ContainerTestHelper; +import org.apache.hadoop.ozone.container.TestHelper; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.Timeout; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.HDDS_SCM_WATCHER_TIMEOUT; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; + +/** + * Tests BlockDataStreamOutput class. + */ +public class TestBlockDataStreamOutput { + + /** + * Set a timeout for each test. + */ + @Rule + public Timeout timeout = Timeout.seconds(300); + private static MiniOzoneCluster cluster; + private static OzoneConfiguration conf = new OzoneConfiguration(); + private static OzoneClient client; + private static ObjectStore objectStore; + private static int chunkSize; + private static int flushSize; + private static int maxFlushSize; + private static int blockSize; + private static String volumeName; + private static String bucketName; + private static String keyString; + + /** + * Create a MiniDFSCluster for testing. + *

+ * Ozone is made active by setting OZONE_ENABLED = true + * + * @throws IOException + */ + @BeforeClass + public static void init() throws Exception { + chunkSize = 100; + flushSize = 2 * chunkSize; + maxFlushSize = 2 * flushSize; + blockSize = 2 * maxFlushSize; + + OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class); + conf.setFromObject(clientConfig); + + conf.setTimeDuration(HDDS_SCM_WATCHER_TIMEOUT, 1000, TimeUnit.MILLISECONDS); + conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, TimeUnit.SECONDS); + conf.setQuietMode(false); + conf.setStorageSize(OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE, 4, + StorageUnit.MB); + conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, + true); + + cluster = MiniOzoneCluster.newBuilder(conf) + .setNumDatanodes(7) + .setTotalPipelineNumLimit(10) + .setBlockSize(blockSize) + .setChunkSize(chunkSize) + .setStreamBufferFlushSize(flushSize) + .setStreamBufferMaxSize(maxFlushSize) + .setDataStreamBufferFlushize(maxFlushSize) + .setStreamBufferSizeUnit(StorageUnit.BYTES) + .setDataStreamMinPacketSize(chunkSize) + .setDataStreamStreamWindowSize(5 * chunkSize) + .build(); + cluster.waitForClusterToBeReady(); + // the easiest way to create an open container is creating a key + client = OzoneClientFactory.getRpcClient(conf); + objectStore = client.getObjectStore(); + keyString = UUID.randomUUID().toString(); + volumeName = "testblockoutputstream"; + bucketName = volumeName; + objectStore.createVolume(volumeName); + objectStore.getVolume(volumeName).createBucket(bucketName); + } + + static String getKeyName() { + return UUID.randomUUID().toString(); + } + + /** + * Shutdown MiniDFSCluster. + */ + @AfterClass + public static void shutdown() { + if (cluster != null) { + cluster.shutdown(); + } + } + + @Test + public void testHalfChunkWrite() throws Exception { + testWrite(chunkSize / 2); + testWriteWithFailure(chunkSize / 2); + } + + @Test + public void testSingleChunkWrite() throws Exception { + testWrite(chunkSize); + testWriteWithFailure(chunkSize); + } + + @Test + public void testMultiChunkWrite() throws Exception { + testWrite(chunkSize + 50); + testWriteWithFailure(chunkSize + 50); + } + + @Test + public void testMultiBlockWrite() throws Exception { + testWrite(blockSize + 50); + testWriteWithFailure(blockSize + 50); + } + + static void testWrite(int dataLength) throws Exception { + String keyName = getKeyName(); + OzoneDataStreamOutput key = createKey( + keyName, ReplicationType.RATIS, dataLength); + final byte[] data = ContainerTestHelper.generateData(dataLength, false); + key.write(ByteBuffer.wrap(data)); + // now close the stream; it will update the key length.
+ key.close(); + validateData(keyName, data); + } + + private void testWriteWithFailure(int dataLength) throws Exception { + String keyName = getKeyName(); + OzoneDataStreamOutput key = createKey( + keyName, ReplicationType.RATIS, dataLength); + byte[] data = + ContainerTestHelper.getFixedLengthString(keyString, dataLength) + .getBytes(UTF_8); + ByteBuffer b = ByteBuffer.wrap(data); + key.write(b); + KeyDataStreamOutput keyDataStreamOutput = + (KeyDataStreamOutput) key.getByteBufStreamOutput(); + ByteBufferStreamOutput stream = + keyDataStreamOutput.getStreamEntries().get(0).getByteBufStreamOutput(); + Assert.assertTrue(stream instanceof BlockDataStreamOutput); + TestHelper.waitForContainerClose(key, cluster); + key.write(b); + key.close(); + String dataString = new String(data, UTF_8); + validateData(keyName, dataString.concat(dataString).getBytes(UTF_8)); + } + + @Test + public void testPutBlockAtBoundary() throws Exception { + int dataLength = 500; + XceiverClientMetrics metrics = + XceiverClientManager.getXceiverClientMetrics(); + long putBlockCount = metrics.getContainerOpCountMetrics( + ContainerProtos.Type.PutBlock); + long pendingPutBlockCount = metrics.getPendingContainerOpCountMetrics( + ContainerProtos.Type.PutBlock); + String keyName = getKeyName(); + OzoneDataStreamOutput key = createKey( + keyName, ReplicationType.RATIS, 0); + byte[] data = + ContainerTestHelper.getFixedLengthString(keyString, dataLength) + .getBytes(UTF_8); + key.write(ByteBuffer.wrap(data)); + Assert.assertTrue( + metrics.getPendingContainerOpCountMetrics(ContainerProtos.Type.PutBlock) + <= pendingPutBlockCount + 1); + key.close(); + // Since data length is 500, first putBlock will be at 400 (flush boundary) + // and the other at 500 + Assert.assertTrue( + metrics.getContainerOpCountMetrics(ContainerProtos.Type.PutBlock) + == putBlockCount + 2); + validateData(keyName, data); + } + + + static OzoneDataStreamOutput createKey(String keyName, ReplicationType type, + long size) throws Exception { + return TestHelper.createStreamKey( + keyName, type, size, objectStore, volumeName, bucketName); + } + static void validateData(String keyName, byte[] data) throws Exception { + TestHelper.validateData( + keyName, data, objectStore, volumeName, bucketName); + } + + + @Test + public void testMinPacketSize() throws Exception { + String keyName = getKeyName(); + XceiverClientMetrics metrics = + XceiverClientManager.getXceiverClientMetrics(); + OzoneDataStreamOutput key = createKey(keyName, ReplicationType.RATIS, 0); + long writeChunkCount = + metrics.getContainerOpCountMetrics(ContainerProtos.Type.WriteChunk); + byte[] data = + ContainerTestHelper.getFixedLengthString(keyString, chunkSize / 2) + .getBytes(UTF_8); + key.write(ByteBuffer.wrap(data)); + // minPacketSize = 100, so the first write of 50 won't trigger a writeChunk + Assert.assertEquals(writeChunkCount, + metrics.getContainerOpCountMetrics(ContainerProtos.Type.WriteChunk)); + key.write(ByteBuffer.wrap(data)); + Assert.assertEquals(writeChunkCount + 1, + metrics.getContainerOpCountMetrics(ContainerProtos.Type.WriteChunk)); + // now close the stream; it will update the key length.
+ key.close(); + String dataString = new String(data, UTF_8); + validateData(keyName, dataString.concat(dataString).getBytes(UTF_8)); + } + + @Test + public void testTotalAckDataLength() throws Exception { + int dataLength = 400; + String keyName = getKeyName(); + OzoneDataStreamOutput key = createKey( + keyName, ReplicationType.RATIS, 0); + byte[] data = + ContainerTestHelper.getFixedLengthString(keyString, dataLength) + .getBytes(UTF_8); + KeyDataStreamOutput keyDataStreamOutput = + (KeyDataStreamOutput) key.getByteBufStreamOutput(); + BlockDataStreamOutputEntry stream = + keyDataStreamOutput.getStreamEntries().get(0); + key.write(ByteBuffer.wrap(data)); + key.close(); + Assert.assertEquals(dataLength, stream.getTotalAckDataLength()); + } + +} diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineStream.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineStream.java new file mode 100644 index 000000000000..86b2697b846f --- /dev/null +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineStream.java @@ -0,0 +1,221 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.hadoop.ozone.client.rpc; + +import org.apache.hadoop.conf.StorageUnit; +import org.apache.hadoop.hdds.client.ReplicationType; +import org.apache.hadoop.hdds.conf.DatanodeRatisServerConfig; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.ratis.conf.RatisClientConfig; +import org.apache.hadoop.hdds.scm.OzoneClientConfig; +import org.apache.hadoop.ozone.HddsDatanodeService; +import org.apache.hadoop.ozone.MiniOzoneCluster; +import org.apache.hadoop.ozone.OzoneConfigKeys; +import org.apache.hadoop.ozone.client.ObjectStore; +import org.apache.hadoop.ozone.client.OzoneClient; +import org.apache.hadoop.ozone.client.OzoneClientFactory; +import org.apache.hadoop.ozone.client.io.KeyDataStreamOutput; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; +import org.apache.hadoop.ozone.container.ContainerTestHelper; +import org.apache.hadoop.ozone.container.TestHelper; +import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.Timeout; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.time.Duration; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_COMMAND_STATUS_REPORT_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; + +/** + * Tests the containerStateMachine stream handling. + */ +public class TestContainerStateMachineStream { + + /** + * Set a timeout for each test. + */ + @Rule + public Timeout timeout = Timeout.seconds(300); + + private MiniOzoneCluster cluster; + private OzoneConfiguration conf = new OzoneConfiguration(); + private OzoneClient client; + private ObjectStore objectStore; + private String volumeName; + private String bucketName; + + private static final int CHUNK_SIZE = 100; + private static final int FLUSH_SIZE = 2 * CHUNK_SIZE; + private static final int MAX_FLUSH_SIZE = 2 * FLUSH_SIZE; + private static final int BLOCK_SIZE = 2 * MAX_FLUSH_SIZE; + + /** + * Create a MiniDFSCluster for testing. 
+ * + * @throws IOException + */ + @Before + public void setup() throws Exception { + conf = new OzoneConfiguration(); + + OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class); + clientConfig.setStreamBufferFlushDelay(false); + conf.setFromObject(clientConfig); + + conf.setTimeDuration(HDDS_CONTAINER_REPORT_INTERVAL, 200, + TimeUnit.MILLISECONDS); + conf.setTimeDuration(HDDS_COMMAND_STATUS_REPORT_INTERVAL, 200, + TimeUnit.MILLISECONDS); + conf.setTimeDuration(HDDS_PIPELINE_REPORT_INTERVAL, 200, + TimeUnit.MILLISECONDS); + conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 30, TimeUnit.SECONDS); + conf.setTimeDuration(OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, 1, + TimeUnit.SECONDS); + conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, + true); + + RatisClientConfig ratisClientConfig = + conf.getObject(RatisClientConfig.class); + ratisClientConfig.setWriteRequestTimeout(Duration.ofSeconds(10)); + ratisClientConfig.setWatchRequestTimeout(Duration.ofSeconds(10)); + conf.setFromObject(ratisClientConfig); + + DatanodeRatisServerConfig ratisServerConfig = + conf.getObject(DatanodeRatisServerConfig.class); + ratisServerConfig.setRequestTimeOut(Duration.ofSeconds(3)); + ratisServerConfig.setWatchTimeOut(Duration.ofSeconds(10)); + conf.setFromObject(ratisServerConfig); + + RatisClientConfig.RaftConfig raftClientConfig = + conf.getObject(RatisClientConfig.RaftConfig.class); + raftClientConfig.setRpcRequestTimeout(Duration.ofSeconds(3)); + raftClientConfig.setRpcWatchRequestTimeout(Duration.ofSeconds(10)); + conf.setFromObject(raftClientConfig); + + conf.setLong(OzoneConfigKeys.DFS_RATIS_SNAPSHOT_THRESHOLD_KEY, 1); + conf.setQuietMode(false); + cluster = + MiniOzoneCluster.newBuilder(conf) + .setNumDatanodes(3) + .setHbInterval(200) + .setDataStreamMinPacketSize(1024) + .setBlockSize(BLOCK_SIZE) + .setChunkSize(CHUNK_SIZE) + .setStreamBufferFlushSize(FLUSH_SIZE) + .setStreamBufferMaxSize(MAX_FLUSH_SIZE) + .setStreamBufferSizeUnit(StorageUnit.BYTES) + .build(); + cluster.waitForClusterToBeReady(); + cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.ONE, 60000); + //the easiest way to create an open container is creating a key + client = OzoneClientFactory.getRpcClient(conf); + objectStore = client.getObjectStore(); + + volumeName = "testcontainerstatemachinestream"; + bucketName = "teststreambucket"; + objectStore.createVolume(volumeName); + objectStore.getVolume(volumeName).createBucket(bucketName); + + } + + /** + * Shutdown MiniDFSCluster. + */ + @After + public void shutdown() { + if (cluster != null) { + cluster.shutdown(); + } + } + + @Test + public void testContainerStateMachineForStreaming() throws Exception { + long size = CHUNK_SIZE + 1; + + OzoneDataStreamOutput key = TestHelper.createStreamKey( + "ozone-stream-test.txt", ReplicationType.RATIS, size, objectStore, + volumeName, bucketName); + + byte[] data = ContainerTestHelper.generateData((int) size, true); + key.write(ByteBuffer.wrap(data)); + key.flush(); + + KeyDataStreamOutput streamOutput = + (KeyDataStreamOutput) key.getByteBufStreamOutput(); + List locationInfoList = + streamOutput.getLocationInfoList(); + + key.close(); + + OmKeyLocationInfo omKeyLocationInfo = locationInfoList.get(0); + HddsDatanodeService dn = TestHelper.getDatanodeService(omKeyLocationInfo, + cluster); + + long bytesUsed = dn.getDatanodeStateMachine() + .getContainer().getContainerSet() + .getContainer(omKeyLocationInfo.getContainerID()). 
+ getContainerData().getBytesUsed(); + + Assert.assertTrue(bytesUsed == size); + } + + + @Test + public void testContainerStateMachineForStreamingSmallFile() + throws Exception { + long size = CHUNK_SIZE - 1; + + OzoneDataStreamOutput key = TestHelper.createStreamKey( + "ozone-stream-test-small-file.txt", ReplicationType.RATIS, size, + objectStore, volumeName, bucketName); + + byte[] data = ContainerTestHelper.generateData((int) size, true); + key.write(ByteBuffer.wrap(data)); + key.flush(); + + KeyDataStreamOutput streamOutput = + (KeyDataStreamOutput) key.getByteBufStreamOutput(); + List locationInfoList = + streamOutput.getLocationInfoList(); + key.close(); + OmKeyLocationInfo omKeyLocationInfo = locationInfoList.get(0); + HddsDatanodeService dn = TestHelper.getDatanodeService(omKeyLocationInfo, + cluster); + + long bytesUsed = dn.getDatanodeStateMachine() + .getContainer().getContainerSet() + .getContainer(omKeyLocationInfo.getContainerID()). + getContainerData().getBytesUsed(); + + Assert.assertTrue(bytesUsed == size); + } + +} diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestOzoneRpcClientAbstract.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestOzoneRpcClientAbstract.java index b8fc543f19df..8d4b02eead91 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestOzoneRpcClientAbstract.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestOzoneRpcClientAbstract.java @@ -200,6 +200,7 @@ static void startCluster(OzoneConfiguration conf) throws Exception { .setTotalPipelineNumLimit(10) .setScmId(scmId) .setClusterId(clusterId) + .setDataStreamMinPacketSize(1024) .build(); cluster.waitForClusterToBeReady(); ozClient = OzoneClientFactory.getRpcClient(conf); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestOzoneRpcClientWithRatis.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestOzoneRpcClientWithRatis.java index 362a218af263..d3e94ce085bb 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestOzoneRpcClientWithRatis.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestOzoneRpcClientWithRatis.java @@ -19,10 +19,12 @@ package org.apache.hadoop.ozone.client.rpc; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.HashMap; import java.util.UUID; +import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.client.ReplicationType; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.scm.ScmConfigKeys; @@ -31,12 +33,15 @@ import org.apache.hadoop.ozone.client.OzoneBucket; import org.apache.hadoop.ozone.client.OzoneClient; import org.apache.hadoop.ozone.client.OzoneClientFactory; +import org.apache.hadoop.ozone.client.OzoneMultipartUploadPartListParts; import org.apache.hadoop.ozone.client.OzoneVolume; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.client.io.OzoneInputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.common.OzoneChecksumException; import org.apache.hadoop.ozone.om.OMConfigKeys; import org.apache.hadoop.ozone.om.helpers.OmKeyArgs; +import org.apache.hadoop.ozone.om.helpers.OmMultipartInfo; import org.junit.jupiter.api.AfterAll; import org.junit.Assert; 
import org.junit.jupiter.api.BeforeAll; @@ -44,6 +49,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.hadoop.hdds.client.ReplicationFactor.THREE; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.fail; /** @@ -72,6 +78,8 @@ public static void init() throws Exception { conf.setBoolean(OzoneConfigKeys.OZONE_ACL_ENABLED, true); conf.set(OzoneConfigKeys.OZONE_ACL_AUTHORIZER_CLASS, OzoneConfigKeys.OZONE_ACL_AUTHORIZER_CLASS_NATIVE); + conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, + true); startCluster(conf); } @@ -155,4 +163,51 @@ public void testGetKeyAndFileWithNetworkTopology() throws IOException { } } } + + @Test + public void testMultiPartUploadWithStream() throws IOException { + String volumeName = UUID.randomUUID().toString(); + String bucketName = UUID.randomUUID().toString(); + String keyName = UUID.randomUUID().toString(); + + byte[] sampleData = new byte[1024 * 8]; + + int valueLength = sampleData.length; + + getStore().createVolume(volumeName); + OzoneVolume volume = getStore().getVolume(volumeName); + volume.createBucket(bucketName); + OzoneBucket bucket = volume.getBucket(bucketName); + + ReplicationConfig replicationConfig = + ReplicationConfig.fromTypeAndFactor( + ReplicationType.RATIS, + THREE); + + OmMultipartInfo multipartInfo = bucket.initiateMultipartUpload(keyName, + replicationConfig); + + assertNotNull(multipartInfo); + String uploadID = multipartInfo.getUploadID(); + Assert.assertEquals(volumeName, multipartInfo.getVolumeName()); + Assert.assertEquals(bucketName, multipartInfo.getBucketName()); + Assert.assertEquals(keyName, multipartInfo.getKeyName()); + assertNotNull(multipartInfo.getUploadID()); + + OzoneDataStreamOutput ozoneStreamOutput = bucket.createMultipartStreamKey( + keyName, valueLength, 1, uploadID); + ozoneStreamOutput.write(ByteBuffer.wrap(sampleData), 0, + valueLength); + ozoneStreamOutput.close(); + + OzoneMultipartUploadPartListParts parts = + bucket.listParts(keyName, uploadID, 0, 1); + + Assert.assertEquals(parts.getPartInfoList().size(), 1); + + OzoneMultipartUploadPartListParts.PartInfo partInfo = + parts.getPartInfoList().get(0); + Assert.assertEquals(valueLength, partInfo.getSize()); + + } } diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestHelper.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestHelper.java index dae6e383f844..14cd1b66f428 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestHelper.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestHelper.java @@ -46,8 +46,11 @@ import org.apache.hadoop.ozone.MiniOzoneCluster; import org.apache.hadoop.ozone.OzoneConsts; import org.apache.hadoop.ozone.client.ObjectStore; +import org.apache.hadoop.ozone.client.io.BlockDataStreamOutputEntry; import org.apache.hadoop.ozone.client.io.BlockOutputStreamEntry; +import org.apache.hadoop.ozone.client.io.KeyDataStreamOutput; import org.apache.hadoop.ozone.client.io.KeyOutputStream; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.client.io.OzoneInputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.container.common.impl.ContainerData; @@ -134,8 +137,23 @@ public static OzoneOutputStream createKey(String keyName, } org.apache.hadoop.hdds.client.ReplicationFactor factor = 
org.apache.hadoop.hdds.client.ReplicationFactor.THREE; + ReplicationConfig config = + ReplicationConfig.fromTypeAndFactor(type, factor); return objectStore.getVolume(volumeName).getBucket(bucketName) - .createKey(keyName, size, type, factor, new HashMap<>()); + .createKey(keyName, size, config, new HashMap<>()); + } + + public static OzoneDataStreamOutput createStreamKey(String keyName, + ReplicationType type, long size, ObjectStore objectStore, + String volumeName, String bucketName) throws Exception { + org.apache.hadoop.hdds.client.ReplicationFactor factor = + type == ReplicationType.STAND_ALONE ? + org.apache.hadoop.hdds.client.ReplicationFactor.ONE : + org.apache.hadoop.hdds.client.ReplicationFactor.THREE; + ReplicationConfig config = + ReplicationConfig.fromTypeAndFactor(type, factor); + return objectStore.getVolume(volumeName).getBucket(bucketName) + .createStreamKey(keyName, size, config, new HashMap<>()); } public static OzoneOutputStream createKey(String keyName, @@ -143,8 +161,10 @@ public static OzoneOutputStream createKey(String keyName, org.apache.hadoop.hdds.client.ReplicationFactor factor, long size, ObjectStore objectStore, String volumeName, String bucketName) throws Exception { + ReplicationConfig config = + ReplicationConfig.fromTypeAndFactor(type, factor); return objectStore.getVolume(volumeName).getBucket(bucketName) - .createKey(keyName, size, type, factor, new HashMap<>()); + .createKey(keyName, size, config, new HashMap<>()); } public static OzoneOutputStream createKey(String keyName, @@ -187,6 +207,24 @@ public static void waitForContainerClose(OzoneOutputStream outputStream, waitForContainerClose(cluster, containerIdList.toArray(new Long[0])); } + + public static void waitForContainerClose(OzoneDataStreamOutput outputStream, + MiniOzoneCluster cluster) throws Exception { + KeyDataStreamOutput keyOutputStream = + (KeyDataStreamOutput) outputStream.getByteBufStreamOutput(); + List streamEntryList = + keyOutputStream.getStreamEntries(); + List containerIdList = new ArrayList<>(); + for (BlockDataStreamOutputEntry entry : streamEntryList) { + long id = entry.getBlockID().getContainerID(); + if (!containerIdList.contains(id)) { + containerIdList.add(id); + } + } + Assert.assertTrue(!containerIdList.isEmpty()); + waitForContainerClose(cluster, containerIdList.toArray(new Long[0])); + } + public static void waitForPipelineClose(OzoneOutputStream outputStream, MiniOzoneCluster cluster, boolean waitForContainerCreation) throws Exception { diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestSecureContainerServer.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestSecureContainerServer.java index cd7c99554438..cb2db30c1023 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestSecureContainerServer.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestSecureContainerServer.java @@ -216,6 +216,8 @@ static XceiverServerRatis newXceiverServerRatis( DatanodeDetails dn, OzoneConfiguration conf) throws IOException { conf.setInt(OzoneConfigKeys.DFS_CONTAINER_RATIS_IPC_PORT, dn.getPort(DatanodeDetails.Port.Name.RATIS).getValue()); + conf.setBoolean( + OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_RANDOM_PORT, true); final String dir = TEST_DIR + dn.getUuid(); conf.set(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATANODE_STORAGE_DIR, dir); final ContainerDispatcher dispatcher = createDispatcher(dn, diff 
--git a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicOzoneClientAdapterImpl.java b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicOzoneClientAdapterImpl.java index bec00e929963..be919b76fe4c 100644 --- a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicOzoneClientAdapterImpl.java +++ b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicOzoneClientAdapterImpl.java @@ -53,6 +53,7 @@ import org.apache.hadoop.ozone.client.OzoneClientFactory; import org.apache.hadoop.ozone.client.OzoneKey; import org.apache.hadoop.ozone.client.OzoneVolume; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.common.MonotonicClock; import org.apache.hadoop.ozone.om.exceptions.OMException; @@ -265,6 +266,38 @@ private ReplicationConfig getReplicationConfigWithRefreshCheck() return this.bucketReplicationConfig; } + @Override + public OzoneFSDataStreamOutput createStreamFile(String key, short replication, + boolean overWrite, boolean recursive) throws IOException { + incrementCounter(Statistic.OBJECTS_CREATED, 1); + try { + OzoneDataStreamOutput ozoneDataStreamOutput = null; + if (replication == ReplicationFactor.ONE.getValue() + || replication == ReplicationFactor.THREE.getValue()) { + + ReplicationConfig customReplicationConfig = + ReplicationConfig.adjustReplication(bucketReplicationConfig, + replication, config); + ozoneDataStreamOutput = bucket + .createStreamFile(key, 0, customReplicationConfig, overWrite, + recursive); + } else { + ozoneDataStreamOutput = bucket.createStreamFile( + key, 0, bucketReplicationConfig, overWrite, recursive); + } + return new OzoneFSDataStreamOutput( + ozoneDataStreamOutput.getByteBufStreamOutput()); + } catch (OMException ex) { + if (ex.getResult() == OMException.ResultCodes.FILE_ALREADY_EXISTS + || ex.getResult() == OMException.ResultCodes.NOT_A_FILE) { + throw new FileAlreadyExistsException( + ex.getResult().name() + ": " + ex.getMessage()); + } else { + throw ex; + } + } + } + @Override public void renameKey(String key, String newKeyName) throws IOException { incrementCounter(Statistic.OBJECTS_RENAMED, 1); diff --git a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicOzoneFileSystem.java b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicOzoneFileSystem.java index 910ca455848f..32a9f3181188 100644 --- a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicOzoneFileSystem.java +++ b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicOzoneFileSystem.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.conf.StorageUnit; import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource; +import org.apache.hadoop.ozone.OzoneConfigKeys; import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.OzoneFSUtils; import org.apache.hadoop.security.UserGroupInformation; @@ -259,6 +260,13 @@ public FSDataOutputStream createNonRecursive(Path path, private FSDataOutputStream createOutputStream(String key, short replication, boolean overwrite, boolean recursive) throws IOException { + boolean isRatisStreamingEnabled = getConf().getBoolean( + OzoneConfigKeys.OZONE_FS_DATASTREAM_ENABLE, + OzoneConfigKeys.OZONE_FS_DATASTREAM_ENABLE_DEFAULT); + if (isRatisStreamingEnabled) { + return new 
FSDataOutputStream(adapter.createStreamFile(key, + replication, overwrite, recursive), statistics); + } return new FSDataOutputStream(adapter.createFile(key, replication, overwrite, recursive), statistics); } diff --git a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicRootedOzoneClientAdapterImpl.java b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicRootedOzoneClientAdapterImpl.java index 052c6c8ba6ca..5295aa33d688 100644 --- a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicRootedOzoneClientAdapterImpl.java +++ b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicRootedOzoneClientAdapterImpl.java @@ -60,6 +60,7 @@ import org.apache.hadoop.ozone.client.OzoneClientFactory; import org.apache.hadoop.ozone.client.OzoneKey; import org.apache.hadoop.ozone.client.OzoneVolume; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.client.protocol.ClientProtocol; import org.apache.hadoop.ozone.om.exceptions.OMException; @@ -352,6 +353,44 @@ public OzoneFSOutputStream createFile(String pathStr, short replication, } } + @Override + public OzoneFSDataStreamOutput createStreamFile(String pathStr, + short replication, boolean overWrite, boolean recursive) + throws IOException { + incrementCounter(Statistic.OBJECTS_CREATED, 1); + OFSPath ofsPath = new OFSPath(pathStr); + if (ofsPath.isRoot() || ofsPath.isVolume() || ofsPath.isBucket()) { + throw new IOException("Cannot create file under root or volume."); + } + String key = ofsPath.getKeyName(); + try { + // Hadoop CopyCommands class always sets recursive to true + OzoneBucket bucket = getBucket(ofsPath, recursive); + OzoneDataStreamOutput ozoneDataStreamOutput = null; + if (replication == ReplicationFactor.ONE.getValue() + || replication == ReplicationFactor.THREE.getValue()) { + + ozoneDataStreamOutput = bucket.createStreamFile(key, 0, + ReplicationConfig.adjustReplication( + clientConfiguredReplicationConfig, replication, config), + overWrite, recursive); + } else { + ozoneDataStreamOutput = bucket.createStreamFile( + key, 0, clientConfiguredReplicationConfig, overWrite, recursive); + } + return new OzoneFSDataStreamOutput( + ozoneDataStreamOutput.getByteBufStreamOutput()); + } catch (OMException ex) { + if (ex.getResult() == OMException.ResultCodes.FILE_ALREADY_EXISTS + || ex.getResult() == OMException.ResultCodes.NOT_A_FILE) { + throw new FileAlreadyExistsException( + ex.getResult().name() + ": " + ex.getMessage()); + } else { + throw ex; + } + } + } + @Override public void renameKey(String key, String newKeyName) throws IOException { throw new IOException("OFS doesn't support renameKey, use rename instead."); diff --git a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicRootedOzoneFileSystem.java b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicRootedOzoneFileSystem.java index 0360e345b291..600abdf4f199 100644 --- a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicRootedOzoneFileSystem.java +++ b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/BasicRootedOzoneFileSystem.java @@ -41,6 +41,7 @@ import org.apache.hadoop.hdds.conf.StorageUnit; import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource; import org.apache.hadoop.ozone.OFSPath; +import org.apache.hadoop.ozone.OzoneConfigKeys; import org.apache.hadoop.ozone.client.OzoneBucket; import 
org.apache.hadoop.ozone.client.OzoneVolume; import org.apache.hadoop.ozone.om.exceptions.OMException; @@ -235,6 +236,13 @@ public FSDataOutputStream createNonRecursive(Path path, private FSDataOutputStream createOutputStream(String key, short replication, boolean overwrite, boolean recursive) throws IOException { + boolean isRatisStreamingEnabled = getConf().getBoolean( + OzoneConfigKeys.OZONE_FS_DATASTREAM_ENABLE, + OzoneConfigKeys.OZONE_FS_DATASTREAM_ENABLE_DEFAULT); + if (isRatisStreamingEnabled) { + return new FSDataOutputStream(adapter.createStreamFile(key, + replication, overwrite, recursive), statistics); + } return new FSDataOutputStream(adapter.createFile(key, replication, overwrite, recursive), statistics); } diff --git a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/OzoneClientAdapter.java b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/OzoneClientAdapter.java index 31bf351f01a6..24566cb83f8b 100644 --- a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/OzoneClientAdapter.java +++ b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/OzoneClientAdapter.java @@ -45,6 +45,9 @@ public interface OzoneClientAdapter { OzoneFSOutputStream createFile(String key, short replication, boolean overWrite, boolean recursive) throws IOException; + OzoneFSDataStreamOutput createStreamFile(String key, short replication, + boolean overWrite, boolean recursive) throws IOException; + void renameKey(String key, String newKeyName) throws IOException; // Users should use rename instead of renameKey in OFS. diff --git a/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/OzoneFSDataStreamOutput.java b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/OzoneFSDataStreamOutput.java new file mode 100644 index 000000000000..515dbca92b42 --- /dev/null +++ b/hadoop-ozone/ozonefs-common/src/main/java/org/apache/hadoop/fs/ozone/OzoneFSDataStreamOutput.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs.ozone; + +import org.apache.hadoop.hdds.scm.storage.ByteBufferStreamOutput; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; + +/** + * The ByteBuffer output stream for Ozone file system. + */ +public class OzoneFSDataStreamOutput extends OutputStream + implements ByteBufferStreamOutput { + + private final ByteBufferStreamOutput byteBufferStreamOutput; + + public OzoneFSDataStreamOutput( + ByteBufferStreamOutput byteBufferStreamOutput) { + this.byteBufferStreamOutput = byteBufferStreamOutput; + } + + /** + * Try to write the [off:off + len) slice in ByteBuf b to DataStream. + * + * @param b the data. + * @param off the start offset in the data. 
+ * @param len the number of bytes to write. + * @throws IOException if an I/O error occurs. + */ + @Override + public void write(ByteBuffer b, int off, int len) + throws IOException { + byteBufferStreamOutput.write(b, off, len); + } + + /** + * Writes the specified byte to this output stream. The general + * contract for write is that one byte is written + * to the output stream. The byte to be written is the eight + * low-order bits of the argument b. The 24 + * high-order bits of b are ignored. + *

+ * Subclasses of OutputStream must provide an + * implementation for this method. + * + * @param b the byte. + * @throws IOException if an I/O error occurs. In particular, + * an IOException may be thrown if the + * output stream has been closed. + */ + @Override + public void write(int b) throws IOException { + byte[] singleBytes = new byte[1]; + singleBytes[0] = (byte) b; + byteBufferStreamOutput.write(ByteBuffer.wrap(singleBytes)); + } + + /** + * Flushes this DataStream output and forces any buffered output bytes + * to be written out. + * + * @throws IOException if an I/O error occurs. + */ + @Override + public void flush() throws IOException { + byteBufferStreamOutput.flush(); + } + + /** + * Closes this stream and releases any system resources associated + * with it. If the stream is already closed then invoking this + * method has no effect. + * + *

As noted in {@link AutoCloseable#close()}, cases where the + * close may fail require careful attention. It is strongly advised + * to relinquish the underlying resources and to internally + * mark the {@code Closeable} as closed, prior to throwing + * the {@code IOException}. + * + * @throws IOException if an I/O error occurs + */ + @Override + public void close() throws IOException { + byteBufferStreamOutput.close(); + } +} diff --git a/hadoop-ozone/s3gateway/src/main/java/org/apache/hadoop/ozone/s3/endpoint/ObjectEndpoint.java b/hadoop-ozone/s3gateway/src/main/java/org/apache/hadoop/ozone/s3/endpoint/ObjectEndpoint.java index d917fdc113c1..cf7eed7f3a6b 100644 --- a/hadoop-ozone/s3gateway/src/main/java/org/apache/hadoop/ozone/s3/endpoint/ObjectEndpoint.java +++ b/hadoop-ozone/s3gateway/src/main/java/org/apache/hadoop/ozone/s3/endpoint/ObjectEndpoint.java @@ -53,8 +53,9 @@ import java.util.Map; import java.util.OptionalLong; -import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.hdds.client.ECReplicationConfig; import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hdds.client.ReplicationType; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.conf.StorageUnit; @@ -90,9 +91,14 @@ import org.apache.commons.lang3.tuple.Pair; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType.EC; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_REPLICATION; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_REPLICATION_TYPE; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_REPLICATION_TYPE_DEFAULT; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_CHUNK_SIZE_DEFAULT; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_CHUNK_SIZE_KEY; +import static org.apache.hadoop.ozone.OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE; +import static org.apache.hadoop.ozone.OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE_DEFAULT; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ENABLE_FILESYSTEM_PATHS; import static org.apache.hadoop.ozone.s3.S3GatewayConfigKeys.OZONE_S3G_CLIENT_BUFFER_SIZE_DEFAULT; import static org.apache.hadoop.ozone.s3.S3GatewayConfigKeys.OZONE_S3G_CLIENT_BUFFER_SIZE_KEY; @@ -132,6 +138,8 @@ public class ObjectEndpoint extends EndpointBase { private List customizableGetHeaders = new ArrayList<>(); private int bufferSize; + private int chunkSize; + private boolean datastreamEnabled; public ObjectEndpoint() { customizableGetHeaders.add("Content-Type"); @@ -150,6 +158,13 @@ public void init() { bufferSize = (int) ozoneConfiguration.getStorageSize( OZONE_S3G_CLIENT_BUFFER_SIZE_KEY, OZONE_S3G_CLIENT_BUFFER_SIZE_DEFAULT, StorageUnit.BYTES); + chunkSize = (int) ozoneConfiguration.getStorageSize( + OZONE_SCM_CHUNK_SIZE_KEY, + OZONE_SCM_CHUNK_SIZE_DEFAULT, + StorageUnit.BYTES); + datastreamEnabled = ozoneConfiguration.getBoolean( + DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, + DFS_CONTAINER_RATIS_DATASTREAM_ENABLE_DEFAULT); } /** @@ -185,11 +200,21 @@ public Response put( storageType = headers.getHeaderString(STORAGE_CLASS_HEADER); boolean storageTypeDefault = StringUtils.isEmpty(storageType); + if (storageTypeDefault) { + storageType = S3StorageType.getDefault(ozoneConfiguration).toString(); + } + // Normal put object OzoneBucket bucket = getBucket(bucketName); ReplicationConfig replicationConfig = getReplicationConfig(bucket, storageType); + boolean enableEC = false; + if 
((replicationConfig != null && + replicationConfig.getReplicationType() == EC) || + bucket.getReplicationConfig() instanceof ECReplicationConfig) { + enableEC = true; + } if (copyHeader != null) { //Copy object, as copy source available. s3GAction = S3GAction.COPY_OBJECT; @@ -199,15 +224,20 @@ public Response put( "Connection", "close").build(); } - output = - bucket.createKey(keyPath, length, replicationConfig, new HashMap<>()); - if ("STREAMING-AWS4-HMAC-SHA256-PAYLOAD" .equals(headers.getHeaderString("x-amz-content-sha256"))) { body = new SignedChunksInputStream(body); } - IOUtils.copy(body, output); + if (datastreamEnabled && !enableEC) { + return ObjectEndpointStreaming + .put(bucket, keyPath, length, replicationConfig, chunkSize, body); + } else { + output = + bucket + .createKey(keyPath, length, replicationConfig, new HashMap<>()); + IOUtils.copy(body, output); + } getMetrics().incCreateKeySuccess(); return Response.ok().status(HttpStatus.SC_OK) @@ -690,7 +720,6 @@ private Response createMultipartKey(String bucket, String key, long length, throws IOException, OS3Exception { try { OzoneBucket ozoneBucket = getBucket(bucket); - String copyHeader; OzoneOutputStream ozoneOutputStream = null; if ("STREAMING-AWS4-HMAC-SHA256-PAYLOAD" @@ -698,10 +727,36 @@ private Response createMultipartKey(String bucket, String key, long length, body = new SignedChunksInputStream(body); } + String copyHeader = headers.getHeaderString(COPY_SOURCE_HEADER); + String storageType = headers.getHeaderString(STORAGE_CLASS_HEADER); + ReplicationConfig replicationConfig = + getReplicationConfig(ozoneBucket, storageType); + + boolean enableEC = false; + if ((replicationConfig != null && + replicationConfig.getReplicationType() == EC) || + ozoneBucket.getReplicationConfig() instanceof ECReplicationConfig) { + enableEC = true; + } + try { + if (datastreamEnabled && !enableEC && copyHeader != null) { + Pair result = parseSourceHeader(copyHeader); + String sourceBucket = result.getLeft(); + String sourceKey = result.getRight(); + OzoneBucket sourceOzoneBucket = getBucket(sourceBucket); + return ObjectEndpointStreaming + .copyMultipartKey(Pair.of(sourceOzoneBucket, sourceKey), + Pair.of(ozoneBucket, key), length, partNumber, uploadID, + chunkSize, headers); + } else if (datastreamEnabled && !enableEC) { + return ObjectEndpointStreaming + .createMultipartKey(ozoneBucket, key, length, partNumber, + uploadID, chunkSize, body); + } + ozoneOutputStream = ozoneBucket.createMultipartKey( key, length, partNumber, uploadID); - copyHeader = headers.getHeaderString(COPY_SOURCE_HEADER); if (copyHeader != null) { Pair result = parseSourceHeader(copyHeader); @@ -734,6 +789,7 @@ private Response createMultipartKey(String bucket, String key, long length, "Bytes to skip: " + rangeHeader.getStartOffset() + " actual: " + skipped); } + IOUtils.copyLarge(sourceObject, ozoneOutputStream, 0, rangeHeader.getEndOffset() - rangeHeader.getStartOffset() + 1); @@ -881,7 +937,6 @@ private CopyObjectResponse copyObject(String copyHeader, } } - OzoneBucket sourceOzoneBucket = getBucket(sourceBucket); OzoneBucket destOzoneBucket = destBucket; @@ -890,15 +945,22 @@ private CopyObjectResponse copyObject(String copyHeader, sourceInputStream = sourceOzoneBucket.readKey(sourceKey); - destOutputStream = destOzoneBucket - .createKey(destkey, sourceKeyLen, replicationConfig, new HashMap<>()); + if (datastreamEnabled) { + ObjectEndpointStreaming + .putKeyWithStream(destOzoneBucket, destkey, sourceKeyLen, chunkSize, + replicationConfig, new HashMap<>(), 
sourceInputStream); + } else { + destOutputStream = destOzoneBucket + .createKey(destkey, sourceKeyLen, replicationConfig, + new HashMap<>()); - IOUtils.copy(sourceInputStream, destOutputStream); + IOUtils.copy(sourceInputStream, destOutputStream); + destOutputStream.close(); + } // Closing here, as if we don't call close this key will not commit in // OM, and getKey fails. sourceInputStream.close(); - destOutputStream.close(); closed = true; OzoneKeyDetails destKeyDetails = destOzoneBucket.getKey(destkey); @@ -991,7 +1053,8 @@ private static OptionalLong parseAndValidateDate(String ozoneDateStr) { } } - private boolean checkCopySourceModificationTime(Long lastModificationTime, + public static boolean checkCopySourceModificationTime( + Long lastModificationTime, String copySourceIfModifiedSinceStr, String copySourceIfUnmodifiedSinceStr) { long copySourceIfModifiedSince = Long.MIN_VALUE; @@ -1016,4 +1079,14 @@ private boolean checkCopySourceModificationTime(Long lastModificationTime, public void setOzoneConfiguration(OzoneConfiguration config) { this.ozoneConfiguration = config; } + + @VisibleForTesting + public boolean isDatastreamEnabled() { + return datastreamEnabled; + } + + @VisibleForTesting + public void setDatastreamEnabled(boolean datastreamEnabled) { + this.datastreamEnabled = datastreamEnabled; + } } diff --git a/hadoop-ozone/s3gateway/src/main/java/org/apache/hadoop/ozone/s3/endpoint/ObjectEndpointStreaming.java b/hadoop-ozone/s3gateway/src/main/java/org/apache/hadoop/ozone/s3/endpoint/ObjectEndpointStreaming.java new file mode 100644 index 000000000000..1d1ad3c4b06a --- /dev/null +++ b/hadoop-ozone/s3gateway/src/main/java/org/apache/hadoop/ozone/s3/endpoint/ObjectEndpointStreaming.java @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.s3.endpoint; + +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.hadoop.ozone.client.OzoneBucket; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; +import org.apache.hadoop.ozone.client.io.OzoneInputStream; +import org.apache.hadoop.ozone.om.exceptions.OMException; +import org.apache.hadoop.ozone.om.helpers.OmMultipartCommitUploadPartInfo; +import org.apache.hadoop.ozone.s3.exception.OS3Exception; +import org.apache.hadoop.ozone.s3.exception.S3ErrorTable; +import org.apache.hadoop.ozone.s3.util.RangeHeader; +import org.apache.hadoop.ozone.s3.util.RangeHeaderParserUtil; +import org.apache.http.HttpStatus; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.core.HttpHeaders; +import javax.ws.rs.core.Response; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ENABLE_FILESYSTEM_PATHS; +import static org.apache.hadoop.ozone.s3.exception.S3ErrorTable.INVALID_REQUEST; +import static org.apache.hadoop.ozone.s3.exception.S3ErrorTable.NO_SUCH_UPLOAD; +import static org.apache.hadoop.ozone.s3.exception.S3ErrorTable.PRECOND_FAILED; +import static org.apache.hadoop.ozone.s3.util.S3Consts.COPY_SOURCE_HEADER_RANGE; +import static org.apache.hadoop.ozone.s3.util.S3Consts.COPY_SOURCE_IF_MODIFIED_SINCE; +import static org.apache.hadoop.ozone.s3.util.S3Consts.COPY_SOURCE_IF_UNMODIFIED_SINCE; + +/** + * Key level rest endpoints for Streaming. + */ +final class ObjectEndpointStreaming { + + private static final Logger LOG = + LoggerFactory.getLogger(ObjectEndpointStreaming.class); + + private ObjectEndpointStreaming() { + } + + public static Response put(OzoneBucket bucket, String keyPath, + long length, ReplicationConfig replicationConfig, + int chunkSize, InputStream body) + throws IOException, OS3Exception { + + try { + Map keyMetadata = new HashMap<>(); + putKeyWithStream(bucket, keyPath, + length, chunkSize, replicationConfig, keyMetadata, body); + return Response.ok().status(HttpStatus.SC_OK).build(); + } catch (IOException ex) { + LOG.error("Exception occurred in PutObject", ex); + if (ex instanceof OMException) { + if (((OMException) ex).getResult() == + OMException.ResultCodes.NOT_A_FILE) { + OS3Exception os3Exception = S3ErrorTable.newError(INVALID_REQUEST, + keyPath); + os3Exception.setErrorMessage("An error occurred (InvalidRequest) " + + "when calling the PutObject/MPU PartUpload operation: " + + OZONE_OM_ENABLE_FILESYSTEM_PATHS + " is enabled Keys are" + + " considered as Unix Paths. 
Path has Violated FS Semantics " + + "which caused put operation to fail."); + throw os3Exception; + } else if ((((OMException) ex).getResult() == + OMException.ResultCodes.PERMISSION_DENIED)) { + throw S3ErrorTable.newError(S3ErrorTable.ACCESS_DENIED, keyPath); + } + } + throw ex; + } + } + + public static void putKeyWithStream(OzoneBucket bucket, + String keyPath, + long length, + int bufferSize, + ReplicationConfig replicationConfig, + Map keyMetadata, + InputStream body) + throws IOException { + try (OzoneDataStreamOutput streamOutput = bucket.createStreamKey(keyPath, + length, replicationConfig, keyMetadata)) { + writeToStreamOutput(streamOutput, body, bufferSize, length); + } + } + + private static void writeToStreamOutput(OzoneDataStreamOutput streamOutput, + InputStream body, int bufferSize) + throws IOException { + writeToStreamOutput(streamOutput, body, bufferSize, Long.MAX_VALUE); + } + + private static void writeToStreamOutput(OzoneDataStreamOutput streamOutput, + InputStream body, int bufferSize, + long length) + throws IOException { + byte[] buffer = new byte[bufferSize]; + ByteBuffer writeByteBuffer; + long total = 0; + do { + int realBufferSize = (int) (length - total); + if (realBufferSize > 0 && realBufferSize < bufferSize) { + buffer = new byte[realBufferSize]; + } + int nn = body.read(buffer); + if (nn == -1) { + break; + } else if (nn != bufferSize) { + byte[] subBuffer = new byte[nn]; + System.arraycopy(buffer, 0, subBuffer, 0, nn); + writeByteBuffer = ByteBuffer.wrap(subBuffer, 0, nn); + } else { + writeByteBuffer = ByteBuffer.wrap(buffer, 0, nn); + } + streamOutput.write(writeByteBuffer, 0, nn); + total += nn; + } while (total != length); + } + + + public static Response createMultipartKey(OzoneBucket ozoneBucket, String key, + long length, int partNumber, + String uploadID, int chunkSize, + InputStream body) + throws IOException, OS3Exception { + try { + OzoneDataStreamOutput streamOutput = null; + try (OzoneDataStreamOutput ozoneStreamOutput = ozoneBucket + .createMultipartStreamKey( + key, length, partNumber, uploadID)) { + writeToStreamOutput(ozoneStreamOutput, body, chunkSize); + streamOutput = ozoneStreamOutput; + } + + String eTag = ""; + if (streamOutput != null) { + OmMultipartCommitUploadPartInfo omMultipartCommitUploadPartInfo = + streamOutput.getCommitUploadPartInfo(); + eTag = omMultipartCommitUploadPartInfo.getPartName(); + } + + return Response.ok().header("ETag", + eTag).build(); + } catch (OMException ex) { + if (ex.getResult() == + OMException.ResultCodes.NO_SUCH_MULTIPART_UPLOAD_ERROR) { + throw S3ErrorTable.newError(NO_SUCH_UPLOAD, + uploadID); + } else if (ex.getResult() == OMException.ResultCodes.PERMISSION_DENIED) { + throw S3ErrorTable.newError(S3ErrorTable.ACCESS_DENIED, + ozoneBucket.getName() + "/" + key); + } + throw ex; + } + } + + public static Response copyMultipartKey(Pair source, + Pair target, + long length, int partNumber, + String uploadID, int chunkSize, + HttpHeaders headers) + throws IOException, OS3Exception { + + OzoneBucket sourceBucket = source.getLeft(); + OzoneBucket ozoneBucket = target.getLeft(); + String sourceKey = source.getRight(); + String key = target.getRight(); + + try { + OzoneDataStreamOutput ozoneStreamOutput = ozoneBucket + .createMultipartStreamKey(key, length, partNumber, uploadID); + + Long sourceKeyModificationTime = sourceBucket. 
+ getKey(sourceKey).getModificationTime().toEpochMilli(); + String copySourceIfModifiedSince = + headers.getHeaderString(COPY_SOURCE_IF_MODIFIED_SINCE); + String copySourceIfUnmodifiedSince = + headers.getHeaderString(COPY_SOURCE_IF_UNMODIFIED_SINCE); + if (!ObjectEndpoint + .checkCopySourceModificationTime(sourceKeyModificationTime, + copySourceIfModifiedSince, copySourceIfUnmodifiedSince)) { + throw S3ErrorTable.newError(PRECOND_FAILED, + sourceBucket + "/" + sourceKey); + } + + try (OzoneInputStream sourceObject = + sourceBucket.readKey(sourceKey)) { + + String range = + headers.getHeaderString(COPY_SOURCE_HEADER_RANGE); + if (range != null) { + RangeHeader rangeHeader = + RangeHeaderParserUtil.parseRangeHeader(range, 0); + LOG.info("Copy range {} after parse {}", range, rangeHeader); + final long skipped = + sourceObject.skip(rangeHeader.getStartOffset()); + if (skipped != rangeHeader.getStartOffset()) { + throw new EOFException( + "Bytes to skip: " + + rangeHeader.getStartOffset() + " actual: " + skipped); + } + writeToStreamOutput(ozoneStreamOutput, sourceObject, chunkSize, + rangeHeader.getEndOffset() - rangeHeader.getStartOffset() + + 1); + } else { + writeToStreamOutput(ozoneStreamOutput, sourceObject, chunkSize); + } + } + + String eTag = ""; + if (ozoneStreamOutput != null) { + ozoneStreamOutput.close(); + OmMultipartCommitUploadPartInfo omMultipartCommitUploadPartInfo = + ozoneStreamOutput.getCommitUploadPartInfo(); + eTag = omMultipartCommitUploadPartInfo.getPartName(); + } + + return Response.ok(new CopyPartResult(eTag)).build(); + } catch (OMException ex) { + if (ex.getResult() == + OMException.ResultCodes.NO_SUCH_MULTIPART_UPLOAD_ERROR) { + throw S3ErrorTable.newError(NO_SUCH_UPLOAD, + uploadID); + } else if (ex.getResult() == OMException.ResultCodes.PERMISSION_DENIED) { + throw S3ErrorTable.newError(S3ErrorTable.ACCESS_DENIED, + ozoneBucket.getName() + "/" + key); + } + throw ex; + } + } + +} diff --git a/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/client/OzoneBucketStub.java b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/client/OzoneBucketStub.java index f9ac9a810665..9b671ac01a5e 100644 --- a/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/client/OzoneBucketStub.java +++ b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/client/OzoneBucketStub.java @@ -22,6 +22,7 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -38,7 +39,9 @@ import org.apache.hadoop.hdds.client.ReplicationType; import org.apache.hadoop.hdds.protocol.StorageType; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.scm.storage.ByteBufferStreamOutput; import org.apache.hadoop.ozone.OzoneAcl; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.client.io.OzoneInputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.client.OzoneMultipartUploadPartListParts.PartInfo; @@ -120,6 +123,54 @@ public void close() throws IOException { return new OzoneOutputStream(byteArrayOutputStream); } + + @Override + public OzoneDataStreamOutput createStreamKey(String key, long size, + ReplicationConfig rConfig, + Map keyMetadata) + throws IOException { + ByteBufferStreamOutput byteBufferStreamOutput = + new ByteBufferStreamOutput() { + + private final ByteBuffer buffer = 
ByteBuffer.allocate((int) size); + private final ReplicationFactor factor = + ReplicationFactor.valueOf(rConfig.getRequiredNodes()); + private final ReplicationType type = ReplicationType + .valueOf(rConfig.getReplicationType().toString()); + + @Override + public void close() throws IOException { + buffer.flip(); + byte[] bytes1 = new byte[buffer.remaining()]; + buffer.get(bytes1); + keyContents.put(key, bytes1); + keyDetails.put(key, new OzoneKeyDetails( + getVolumeName(), + getName(), + key, + size, + System.currentTimeMillis(), + System.currentTimeMillis(), + new ArrayList<>(), type, metadata, null, + factor.getValue() + )); + } + + @Override + public void write(ByteBuffer b, int off, int len) + throws IOException { + buffer.put(b.array(), off, len); + } + + @Override + public void flush() throws IOException { + } + }; + + return new OzoneDataStreamOutputStub(byteBufferStreamOutput, key + size); + } + + @Override public OzoneOutputStream createKey(String key, long size, ReplicationConfig rConfig, Map metadata) @@ -260,8 +311,55 @@ public void close() throws IOException { } @Override - public OmMultipartUploadCompleteInfo completeMultipartUpload(String key, - String uploadID, Map partsMap) throws IOException { + public OzoneDataStreamOutput createMultipartStreamKey(String key, + long size, + int partNumber, + String uploadID) + throws IOException { + String multipartUploadID = multipartUploadIdMap.get(key); + if (multipartUploadID == null || !multipartUploadID.equals(uploadID)) { + throw new OMException(ResultCodes.NO_SUCH_MULTIPART_UPLOAD_ERROR); + } else { + ByteBufferStreamOutput byteBufferStreamOutput = + new ByteBufferStreamOutput() { + private final ByteBuffer buffer = ByteBuffer.allocate(1024 * 1024); + + @Override + public void close() throws IOException { + int position = buffer.position(); + buffer.flip(); + byte[] bytes = new byte[position]; + buffer.get(bytes); + + Part part = new Part(key + size, bytes); + if (partList.get(key) == null) { + Map parts = new TreeMap<>(); + parts.put(partNumber, part); + partList.put(key, parts); + } else { + partList.get(key).put(partNumber, part); + } + } + + @Override + public void write(ByteBuffer b, int off, int len) + throws IOException { + buffer.put(b.array(), off, len); + } + + @Override + public void flush() throws IOException { + } + }; + + return new OzoneDataStreamOutputStub(byteBufferStreamOutput, key + size); + } + } + + @Override + public OmMultipartUploadCompleteInfo completeMultipartUpload( + String key, String uploadID, Map partsMap) + throws IOException { if (multipartUploadIdMap.get(key) == null) { throw new OMException(ResultCodes.NO_SUCH_MULTIPART_UPLOAD_ERROR); diff --git a/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/client/OzoneDataStreamOutputStub.java b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/client/OzoneDataStreamOutputStub.java new file mode 100644 index 000000000000..7bb35682d8da --- /dev/null +++ b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/client/OzoneDataStreamOutputStub.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +package org.apache.hadoop.ozone.client; + +import org.apache.hadoop.hdds.scm.storage.ByteBufferStreamOutput; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; +import org.apache.hadoop.ozone.om.helpers.OmMultipartCommitUploadPartInfo; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * OzoneDataStreamOutput stub for testing. + */ +public class OzoneDataStreamOutputStub extends OzoneDataStreamOutput { + + private final String partName; + private boolean closed = false; + + /** + * Constructs OzoneDataStreamOutputStub with streamOutput and partName. + */ + public OzoneDataStreamOutputStub( + ByteBufferStreamOutput byteBufferStreamOutput, + String partName) { + super(byteBufferStreamOutput); + this.partName = partName; + } + + @Override + public void write(ByteBuffer b, int off, int len) throws IOException { + getByteBufStreamOutput().write(b, off, len); + } + + @Override + public synchronized void flush() throws IOException { + getByteBufStreamOutput().flush(); + } + + @Override + public synchronized void close() throws IOException { + if (!closed) { + getByteBufStreamOutput().close(); + closed = true; + } + } + + @Override + public OmMultipartCommitUploadPartInfo getCommitUploadPartInfo() { + return closed ? new OmMultipartCommitUploadPartInfo(partName) : null; + } +} diff --git a/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestMultipartStreamUploadWithCopy.java b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestMultipartStreamUploadWithCopy.java new file mode 100644 index 000000000000..586c2c380482 --- /dev/null +++ b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestMultipartStreamUploadWithCopy.java @@ -0,0 +1,432 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + */ + +package org.apache.hadoop.ozone.s3.endpoint; + +import org.apache.hadoop.hdds.client.ReplicationFactor; +import org.apache.hadoop.hdds.client.ReplicationType; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.ozone.OzoneConfigKeys; +import org.apache.hadoop.ozone.OzoneConsts; +import org.apache.hadoop.ozone.client.OzoneBucket; +import org.apache.hadoop.ozone.client.OzoneClient; +import org.apache.hadoop.ozone.client.OzoneClientStub; +import org.apache.hadoop.ozone.s3.endpoint.CompleteMultipartUploadRequest.Part; +import org.apache.hadoop.ozone.s3.exception.OS3Exception; +import org.apache.hadoop.ozone.s3.exception.S3ErrorTable; +import org.apache.hadoop.ozone.web.utils.OzoneUtils; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.mockito.Mockito; + +import javax.ws.rs.core.HttpHeaders; +import javax.ws.rs.core.Response; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Scanner; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.hadoop.ozone.s3.util.S3Consts.COPY_SOURCE_HEADER; +import static org.apache.hadoop.ozone.s3.util.S3Consts.COPY_SOURCE_HEADER_RANGE; +import static org.apache.hadoop.ozone.s3.util.S3Consts.COPY_SOURCE_IF_MODIFIED_SINCE; +import static org.apache.hadoop.ozone.s3.util.S3Consts.COPY_SOURCE_IF_UNMODIFIED_SINCE; +import static org.apache.hadoop.ozone.s3.util.S3Consts.STORAGE_CLASS_HEADER; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.when; + +/** + * Class to test Multipart upload where parts are created with copy header. 
+ */ + +public class TestMultipartStreamUploadWithCopy { + + private static final ObjectEndpoint REST = new ObjectEndpoint(); + + private static final String KEY = "MultipartStreamkey2"; + private static final String EXISTING_KEY = "MultipartStreamkey1"; + private static final String EXISTING_KEY_CONTENT = "MultipartStreamtestkey"; + private static final OzoneClient CLIENT = new OzoneClientStub(); + private static final long DELAY_MS = 2000; + private static long sourceKeyLastModificationTime; + private static String beforeSourceKeyModificationTimeStr; + private static String afterSourceKeyModificationTimeStr; + private static String futureTimeStr; + private static final String UNPARSABLE_TIME_STR = "Unparsable time string"; + private static final String ERROR_CODE = + S3ErrorTable.PRECOND_FAILED.getCode(); + @BeforeClass + public static void setUp() throws Exception { + CLIENT.getObjectStore().createS3Bucket(OzoneConsts.S3_BUCKET); + + OzoneBucket bucket = + CLIENT.getObjectStore().getS3Bucket(OzoneConsts.S3_BUCKET); + + byte[] keyContent = EXISTING_KEY_CONTENT.getBytes(UTF_8); + try (OutputStream stream = bucket + .createKey(EXISTING_KEY, keyContent.length, ReplicationType.RATIS, + ReplicationFactor.THREE, new HashMap<>())) { + stream.write(keyContent); + } + + sourceKeyLastModificationTime = CLIENT.getObjectStore() + .getS3Bucket(OzoneConsts.S3_BUCKET) + .getKey(EXISTING_KEY) + .getModificationTime().toEpochMilli(); + beforeSourceKeyModificationTimeStr = + OzoneUtils.formatTime(sourceKeyLastModificationTime - 1000); + afterSourceKeyModificationTimeStr = + OzoneUtils.formatTime(sourceKeyLastModificationTime + DELAY_MS); + futureTimeStr = + OzoneUtils.formatTime(sourceKeyLastModificationTime + + 1000 * 60 * 24); + + // Make sure DELAY_MS has passed, otherwise + // afterSourceKeyModificationTimeStr will be in the future + // and thus invalid + long currentTime = System.currentTimeMillis(); + long sleepMs = sourceKeyLastModificationTime + DELAY_MS - currentTime; + if (sleepMs > 0) { + Thread.sleep(sleepMs); + } + HttpHeaders headers = Mockito.mock(HttpHeaders.class); + when(headers.getHeaderString(STORAGE_CLASS_HEADER)).thenReturn( + "STANDARD"); + + REST.setHeaders(headers); + REST.setClient(CLIENT); + + OzoneConfiguration conf = new OzoneConfiguration(); + conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, + true); + REST.setOzoneConfiguration(conf); + + REST.setDatastreamEnabled(true); + REST.init(); + } + + @Test + public void testEnableStream() { + assertTrue(REST.isDatastreamEnabled()); + } + + @Test + public void testMultipart() throws Exception { + // Initiate multipart upload + String uploadID = initiateMultipartUpload(KEY); + + List partsList = new ArrayList<>(); + + // Upload parts + String content = "Multipart Upload 1"; + + Part part1 = uploadPart(KEY, uploadID, 1, content); + partsList.add(part1); + + Part part2 = + uploadPartWithCopy(KEY, uploadID, 2, + OzoneConsts.S3_BUCKET + "/" + EXISTING_KEY, null); + partsList.add(part2); + + Part part3 = + uploadPartWithCopy(KEY, uploadID, 3, + OzoneConsts.S3_BUCKET + "/" + EXISTING_KEY, "bytes=0-3"); + partsList.add(part3); + + Part part4 = + uploadPartWithCopy(KEY, uploadID, 3, + OzoneConsts.S3_BUCKET + "/" + EXISTING_KEY, "bytes=0-3", + beforeSourceKeyModificationTimeStr, + afterSourceKeyModificationTimeStr + ); + partsList.add(part4); + + // complete multipart upload + CompleteMultipartUploadRequest completeMultipartUploadRequest = new + CompleteMultipartUploadRequest(); + 
completeMultipartUploadRequest.setPartList(partsList); + + completeMultipartUpload(KEY, completeMultipartUploadRequest, + uploadID); + + OzoneBucket bucket = + CLIENT.getObjectStore().getS3Bucket(OzoneConsts.S3_BUCKET); + try (InputStream is = bucket.readKey(KEY)) { + String keyContent = new Scanner(is, UTF_8.name()) + .useDelimiter("\\A").next(); + Assert.assertEquals( + content + EXISTING_KEY_CONTENT + EXISTING_KEY_CONTENT.substring(0, 4), + keyContent); + } + } + + /** + * CopyIfTimestampTestCase captures all the possibilities for the time stamps + * that can be passed into the multipart copy with copy-if flags for + * timestamps. Only some of the cases are valid others should raise an + * exception. + * Time stamps can be, + * 1. after the timestamp on the object but still a valid time stamp + * (in regard to wall clock time on server) + * 2. before the timestamp on the object + * 3. In the Future beyond the wall clock time on the server + * 4. Null + * 5. Unparsable + */ + public enum CopyIfTimestampTestCase { + MODIFIED_SINCE_AFTER_TS_UNMODIFIED_SINCE_AFTER_TS( + afterSourceKeyModificationTimeStr, afterSourceKeyModificationTimeStr, + ERROR_CODE), + MODIFIED_SINCE_AFTER_TS_UNMODIFIED_SINCE_BEFORE_TS( + afterSourceKeyModificationTimeStr, beforeSourceKeyModificationTimeStr, + ERROR_CODE), + MODIFIED_SINCE_AFTER_TS_UNMODIFIED_SINCE_NULL( + afterSourceKeyModificationTimeStr, null, + ERROR_CODE), + MODIFIED_SINCE_AFTER_TS_UNMODIFIED_SINCE_FUTURE( + afterSourceKeyModificationTimeStr, futureTimeStr, + ERROR_CODE), + MODIFIED_SINCE_AFTER_TS_UNMODIFIED_SINCE_UNPARSABLE_TS( + afterSourceKeyModificationTimeStr, UNPARSABLE_TIME_STR, + ERROR_CODE), + + MODIFIED_SINCE_BEFORE_TS_UNMODIFIED_SINCE_AFTER_TS( + beforeSourceKeyModificationTimeStr, afterSourceKeyModificationTimeStr, + null), + MODIFIED_SINCE_BEFORE_TS_UNMODIFIED_SINCE_BEFORE_TS( + beforeSourceKeyModificationTimeStr, beforeSourceKeyModificationTimeStr, + ERROR_CODE), + MODIFIED_SINCE_BEFORE_TS_UNMODIFIED_SINCE_NULL( + beforeSourceKeyModificationTimeStr, null, + null), + MODIFIED_SINCE_BEFORE_TS_UNMOFIFIED_SINCE_FUTURE( + beforeSourceKeyModificationTimeStr, futureTimeStr, + null), + MODIFIED_SINCE_BEFORE_TS_UNMODIFIED_SINCE_UNPARSABLE_TS( + beforeSourceKeyModificationTimeStr, UNPARSABLE_TIME_STR, + null), + + MODIFIED_SINCE_NULL_TS_UNMODIFIED_SINCE_AFTER_TS( + null, afterSourceKeyModificationTimeStr, + null), + MODIFIED_SINCE_NULL_TS_UNMODIFIED_SINCE_BEFORE_TS( + null, beforeSourceKeyModificationTimeStr, + ERROR_CODE), + MODIFIED_SINCE_NULL_TS_UNMODIFIED_SINCE_NULL_TS( + null, null, + null), + MODIFIED_SINCE_NULL_TS_UNMODIFIED_SINCE_FUTURE_TS( + null, futureTimeStr, + null), + MODIFIED_SINCE_NULL_TS_UNMODIFIED_SINCE_UNPARSABLE_TS( + null, UNPARSABLE_TIME_STR, + null), + + MODIFIED_SINCE_UNPARSABLE_TS_UNMODIFIED_SINCE_AFTER_TS( + UNPARSABLE_TIME_STR, afterSourceKeyModificationTimeStr, + null), + MODIFIED_SINCE_UNPARSABLE_TS_UNMODIFIED_SINCE_BEFORE_TS( + UNPARSABLE_TIME_STR, beforeSourceKeyModificationTimeStr, + ERROR_CODE), + MODIFIED_SINCE_UNPARSABLE_TS_UNMODIFIED_SINCE_NULL_TS( + UNPARSABLE_TIME_STR, null, + null), + MODIFIED_SINCE_UNPARSABLE_TS_UNMODIFIED_SINCE_FUTURE_TS( + UNPARSABLE_TIME_STR, futureTimeStr, + null), + MODIFIED_SINCE_UNPARSABLE_TS_UNMODIFIED_SINCE_UNPARSABLE_TS( + UNPARSABLE_TIME_STR, UNPARSABLE_TIME_STR, + null), + + MODIFIED_SINCE_FUTURE_TS_UNMODIFIED_SINCE_AFTER_TS( + futureTimeStr, afterSourceKeyModificationTimeStr, + null), + MODIFIED_SINCE_FUTURE_TS_UNMODIFIED_SINCE_BEFORE_TS( + futureTimeStr, 
beforeSourceKeyModificationTimeStr, + ERROR_CODE), + MODIFIED_SINCE_FUTURE_TS_UNMODIFIED_SINCE_NULL_TS( + futureTimeStr, null, + null), + MODIFIED_SINCE_FUTURE_TS_UNMODIFIED_SINCE_FUTURE_TS( + futureTimeStr, futureTimeStr, + null), + MODIFIED_SINCE_FUTURE_TS_UNMODIFIED_SINCE_UNPARSABLE_TS( + futureTimeStr, UNPARSABLE_TIME_STR, + null); + + private final String modifiedTimestamp; + private final String unmodifiedTimestamp; + private final String errorCode; + + CopyIfTimestampTestCase(String modifiedTimestamp, + String unmodifiedTimestamp, String errorCode) { + this.modifiedTimestamp = modifiedTimestamp; + this.unmodifiedTimestamp = unmodifiedTimestamp; + this.errorCode = errorCode; + } + + @Override + public String toString() { + return this.name() + + " Modified:" + this.modifiedTimestamp + + " Unmodified:" + this.unmodifiedTimestamp + + " ErrorCode:" + this.errorCode; + } + } + @Test + public void testMultipartTSHeaders() throws Exception { + for (CopyIfTimestampTestCase t : CopyIfTimestampTestCase.values()) { + try { + uploadPartWithCopy(t.modifiedTimestamp, t.unmodifiedTimestamp); + if (t.errorCode != null) { + fail("Fail test:" + t); + } + } catch (OS3Exception ex) { + if ((t.errorCode == null) || (!ex.getCode().equals(ERROR_CODE))) { + fail("Failed test:" + t); + } + } + } + } + + private String initiateMultipartUpload(String key) throws IOException, + OS3Exception { + setHeaders(); + Response response = REST.initializeMultipartUpload(OzoneConsts.S3_BUCKET, + key); + MultipartUploadInitiateResponse multipartUploadInitiateResponse = + (MultipartUploadInitiateResponse) response.getEntity(); + assertNotNull(multipartUploadInitiateResponse.getUploadID()); + String uploadID = multipartUploadInitiateResponse.getUploadID(); + + assertEquals(200, response.getStatus()); + + return uploadID; + + } + + private Part uploadPart(String key, String uploadID, int partNumber, String + content) throws IOException, OS3Exception { + setHeaders(); + ByteArrayInputStream body = + new ByteArrayInputStream(content.getBytes(UTF_8)); + Response response = REST.put(OzoneConsts.S3_BUCKET, key, content.length(), + partNumber, uploadID, body); + assertEquals(200, response.getStatus()); + assertNotNull(response.getHeaderString("ETag")); + Part part = new Part(); + part.seteTag(response.getHeaderString("ETag")); + part.setPartNumber(partNumber); + + return part; + } + + private Part uploadPartWithCopy(String key, String uploadID, int partNumber, + String keyOrigin, String range) throws IOException, OS3Exception { + return uploadPartWithCopy(key, uploadID, partNumber, keyOrigin, + range, null, null); + } + + private Part uploadPartWithCopy(String ifModifiedSinceStr, + String ifUnmodifiedSinceStr) throws IOException, OS3Exception { + // Initiate multipart upload + String uploadID = initiateMultipartUpload(KEY); + + return uploadPartWithCopy(KEY, uploadID, 1, + OzoneConsts.S3_BUCKET + "/" + EXISTING_KEY, "bytes=0-3", + ifModifiedSinceStr, ifUnmodifiedSinceStr); + } + + private Part uploadPartWithCopy(String key, String uploadID, int partNumber, + String keyOrigin, String range, String ifModifiedSinceStr, + String ifUnmodifiedSinceStr) throws IOException, OS3Exception { + Map additionalHeaders = new HashMap<>(); + additionalHeaders.put(COPY_SOURCE_HEADER, keyOrigin); + if (range != null) { + additionalHeaders.put(COPY_SOURCE_HEADER_RANGE, range); + } + if (ifModifiedSinceStr != null) { + additionalHeaders.put(COPY_SOURCE_IF_MODIFIED_SINCE, ifModifiedSinceStr); + } + if (ifUnmodifiedSinceStr != null) { + 
additionalHeaders.put(COPY_SOURCE_IF_UNMODIFIED_SINCE, + ifUnmodifiedSinceStr); + } + setHeaders(additionalHeaders); + + ByteArrayInputStream body = new ByteArrayInputStream("".getBytes(UTF_8)); + Response response = REST.put(OzoneConsts.S3_BUCKET, key, 0, partNumber, + uploadID, body); + assertEquals(200, response.getStatus()); + + CopyPartResult result = (CopyPartResult) response.getEntity(); + assertNotNull(result.getETag()); + assertNotNull(result.getLastModified()); + Part part = new Part(); + part.seteTag(result.getETag()); + part.setPartNumber(partNumber); + + return part; + } + + private void completeMultipartUpload(String key, + CompleteMultipartUploadRequest completeMultipartUploadRequest, + String uploadID) throws IOException, OS3Exception { + setHeaders(); + Response response = REST.completeMultipartUpload(OzoneConsts.S3_BUCKET, key, + uploadID, completeMultipartUploadRequest); + + assertEquals(200, response.getStatus()); + + CompleteMultipartUploadResponse completeMultipartUploadResponse = + (CompleteMultipartUploadResponse) response.getEntity(); + + assertEquals(OzoneConsts.S3_BUCKET, + completeMultipartUploadResponse.getBucket()); + assertEquals(KEY, completeMultipartUploadResponse.getKey()); + assertEquals(OzoneConsts.S3_BUCKET, + completeMultipartUploadResponse.getLocation()); + assertNotNull(completeMultipartUploadResponse.getETag()); + } + + private void setHeaders(Map additionalHeaders) { + HttpHeaders headers = Mockito.mock(HttpHeaders.class); + when(headers.getHeaderString(STORAGE_CLASS_HEADER)).thenReturn( + "STANDARD"); + + additionalHeaders + .forEach((k, v) -> when(headers.getHeaderString(k)).thenReturn(v)); + REST.setHeaders(headers); + } + + private void setHeaders() { + setHeaders(new HashMap<>()); + } + +} diff --git a/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestObjectPutWithStream.java b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestObjectPutWithStream.java new file mode 100644 index 000000000000..f73dfb10fe2e --- /dev/null +++ b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestObjectPutWithStream.java @@ -0,0 +1,277 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + */ + +package org.apache.hadoop.ozone.s3.endpoint; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.hdds.client.ReplicationType; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.ozone.OzoneConfigKeys; +import org.apache.hadoop.ozone.client.OzoneClient; +import org.apache.hadoop.ozone.client.OzoneClientStub; +import org.apache.hadoop.ozone.client.OzoneKeyDetails; +import org.apache.hadoop.ozone.client.io.OzoneInputStream; +import org.apache.hadoop.ozone.s3.exception.OS3Exception; +import org.apache.hadoop.ozone.s3.exception.S3ErrorTable; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mockito; + +import javax.ws.rs.core.HttpHeaders; +import javax.ws.rs.core.Response; +import java.io.ByteArrayInputStream; +import java.io.IOException; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.hadoop.ozone.s3.util.S3Consts.COPY_SOURCE_HEADER; +import static org.apache.hadoop.ozone.s3.util.S3Consts.STORAGE_CLASS_HEADER; +import static org.apache.hadoop.ozone.s3.util.S3Utils.urlEncode; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.when; + +/** + * Test put object. + */ +public class TestObjectPutWithStream { + public static final String CONTENT = "0123456789"; + private String bucketName = "streamb1"; + private String keyName = "key=value/1"; + private String destBucket = "streamb2"; + private String destkey = "key=value/2"; + private String nonexist = "nonexist"; + private OzoneClient clientStub; + private ObjectEndpoint objectEndpoint; + + @Before + public void setup() throws IOException { + //Create client stub and object store stub. 
+ clientStub = new OzoneClientStub(); + + // Create bucket + clientStub.getObjectStore().createS3Bucket(bucketName); + clientStub.getObjectStore().createS3Bucket(destBucket); + + // Create PutObject and setClient to OzoneClientStub + objectEndpoint = new ObjectEndpoint(); + objectEndpoint.setClient(clientStub); + + OzoneConfiguration conf = new OzoneConfiguration(); + conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, + true); + objectEndpoint.setOzoneConfiguration(conf); + objectEndpoint.setDatastreamEnabled(true); + } + + @Test + public void testEnableStream() { + assertTrue(objectEndpoint.isDatastreamEnabled()); + } + + @Test + public void testPutObject() throws IOException, OS3Exception { + //GIVEN + HttpHeaders headers = Mockito.mock(HttpHeaders.class); + ByteArrayInputStream body = + new ByteArrayInputStream(CONTENT.getBytes(UTF_8)); + objectEndpoint.setHeaders(headers); + objectEndpoint.init(); + + //WHEN + Response response = objectEndpoint.put(bucketName, keyName, CONTENT + .length(), 1, null, body); + + + //THEN + OzoneInputStream ozoneInputStream = + clientStub.getObjectStore().getS3Bucket(bucketName) + .readKey(keyName); + String keyContent = + IOUtils.toString(ozoneInputStream, UTF_8); + + Assert.assertEquals(200, response.getStatus()); + Assert.assertEquals(CONTENT, keyContent); + } + + @Test + public void testPutObjectWithSignedChunks() throws IOException, OS3Exception { + //GIVEN + HttpHeaders headers = Mockito.mock(HttpHeaders.class); + objectEndpoint.setHeaders(headers); + objectEndpoint.init(); + + String chunkedContent = "0a;chunk-signature=signature\r\n" + + "1234567890\r\n" + + "05;chunk-signature=signature\r\n" + + "abcde\r\n"; + + when(headers.getHeaderString("x-amz-content-sha256")) + .thenReturn("STREAMING-AWS4-HMAC-SHA256-PAYLOAD"); + + //WHEN + Response response = objectEndpoint.put(bucketName, keyName, + chunkedContent.length(), 1, null, + new ByteArrayInputStream(chunkedContent.getBytes(UTF_8))); + + //THEN + OzoneInputStream ozoneInputStream = + clientStub.getObjectStore().getS3Bucket(bucketName) + .readKey(keyName); + String keyContent = IOUtils.toString(ozoneInputStream, UTF_8); + + Assert.assertEquals(200, response.getStatus()); + Assert.assertEquals("1234567890abcde", keyContent); + } + + @Test + public void testCopyObject() throws IOException, OS3Exception { + // Put object in to source bucket + HttpHeaders headers = Mockito.mock(HttpHeaders.class); + ByteArrayInputStream body = + new ByteArrayInputStream(CONTENT.getBytes(UTF_8)); + objectEndpoint.setHeaders(headers); + keyName = "sourceKey"; + objectEndpoint.init(); + + Response response = objectEndpoint.put(bucketName, keyName, + CONTENT.length(), 1, null, body); + + OzoneInputStream ozoneInputStream = clientStub.getObjectStore() + .getS3Bucket(bucketName) + .readKey(keyName); + + String keyContent = IOUtils.toString(ozoneInputStream, UTF_8); + + Assert.assertEquals(200, response.getStatus()); + Assert.assertEquals(CONTENT, keyContent); + + + // Add copy header, and then call put + when(headers.getHeaderString(COPY_SOURCE_HEADER)).thenReturn( + bucketName + "/" + urlEncode(keyName)); + + response = objectEndpoint.put(destBucket, destkey, CONTENT.length(), 1, + null, body); + + // Check destination key and response + ozoneInputStream = clientStub.getObjectStore().getS3Bucket(destBucket) + .readKey(destkey); + + keyContent = IOUtils.toString(ozoneInputStream, UTF_8); + + Assert.assertEquals(200, response.getStatus()); + Assert.assertEquals(CONTENT, keyContent); + + // source and dest 
same + try { + objectEndpoint.put(bucketName, keyName, CONTENT.length(), 1, null, body); + fail("test copy object failed"); + } catch (OS3Exception ex) { + Assert.assertTrue(ex.getErrorMessage().contains("This copy request is " + + "illegal")); + } + + // source bucket not found + try { + when(headers.getHeaderString(COPY_SOURCE_HEADER)).thenReturn( + nonexist + "/" + urlEncode(keyName)); + objectEndpoint.put(destBucket, destkey, CONTENT.length(), 1, null, + body); + fail("test copy object failed"); + } catch (OS3Exception ex) { + Assert.assertTrue(ex.getCode().contains("NoSuchBucket")); + } + + // dest bucket not found + try { + when(headers.getHeaderString(COPY_SOURCE_HEADER)).thenReturn( + bucketName + "/" + urlEncode(keyName)); + objectEndpoint.put(nonexist, destkey, CONTENT.length(), 1, null, body); + fail("test copy object failed"); + } catch (OS3Exception ex) { + Assert.assertTrue(ex.getCode().contains("NoSuchBucket")); + } + + //Both source and dest bucket not found + try { + when(headers.getHeaderString(COPY_SOURCE_HEADER)).thenReturn( + nonexist + "/" + urlEncode(keyName)); + objectEndpoint.put(nonexist, destkey, CONTENT.length(), 1, null, body); + fail("test copy object failed"); + } catch (OS3Exception ex) { + Assert.assertTrue(ex.getCode().contains("NoSuchBucket")); + } + + // source key not found + try { + when(headers.getHeaderString(COPY_SOURCE_HEADER)).thenReturn( + bucketName + "/" + urlEncode(nonexist)); + objectEndpoint.put("nonexistent", keyName, CONTENT.length(), 1, + null, body); + fail("test copy object failed"); + } catch (OS3Exception ex) { + Assert.assertTrue(ex.getCode().contains("NoSuchBucket")); + } + + } + + @Test + public void testInvalidStorageType() throws IOException { + HttpHeaders headers = Mockito.mock(HttpHeaders.class); + ByteArrayInputStream body = + new ByteArrayInputStream(CONTENT.getBytes(UTF_8)); + objectEndpoint.setHeaders(headers); + keyName = "sourceKey"; + when(headers.getHeaderString(STORAGE_CLASS_HEADER)).thenReturn("random"); + + try { + objectEndpoint.put(bucketName, keyName, + CONTENT.length(), 1, null, body); + fail("testInvalidStorageType"); + } catch (OS3Exception ex) { + assertEquals(S3ErrorTable.INVALID_ARGUMENT.getErrorMessage(), + ex.getErrorMessage()); + assertEquals("random", ex.getResource()); + } + } + + @Test + public void testEmptyStorageType() throws IOException, OS3Exception { + HttpHeaders headers = Mockito.mock(HttpHeaders.class); + ByteArrayInputStream body = + new ByteArrayInputStream(CONTENT.getBytes(UTF_8)); + objectEndpoint.setHeaders(headers); + objectEndpoint.init(); + keyName = "sourceKey"; + when(headers.getHeaderString(STORAGE_CLASS_HEADER)).thenReturn(""); + + objectEndpoint.put(bucketName, keyName, CONTENT + .length(), 1, null, body); + + OzoneKeyDetails key = + clientStub.getObjectStore().getS3Bucket(bucketName) + .getKey(keyName); + + //default type is set + Assert.assertEquals(ReplicationType.RATIS, key.getReplicationType()); + } +} diff --git a/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestPartUploadWithStream.java b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestPartUploadWithStream.java new file mode 100644 index 000000000000..3a1b0f0ecf12 --- /dev/null +++ b/hadoop-ozone/s3gateway/src/test/java/org/apache/hadoop/ozone/s3/endpoint/TestPartUploadWithStream.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +package org.apache.hadoop.ozone.s3.endpoint; + +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.ozone.OzoneConfigKeys; +import org.apache.hadoop.ozone.client.OzoneClient; +import org.apache.hadoop.ozone.client.OzoneClientStub; +import org.apache.hadoop.ozone.s3.exception.OS3Exception; +import org.junit.BeforeClass; +import org.junit.Test; +import org.mockito.Mockito; + +import javax.ws.rs.core.HttpHeaders; +import javax.ws.rs.core.Response; +import java.io.ByteArrayInputStream; + +import static java.net.HttpURLConnection.HTTP_NOT_FOUND; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.hadoop.ozone.s3.util.S3Consts.STORAGE_CLASS_HEADER; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.when; + +/** + * This class tests Upload part request. + */ +public class TestPartUploadWithStream { + + private static final ObjectEndpoint REST = new ObjectEndpoint(); + + private static final String S3BUCKET = "streampartb1"; + private static final String S3KEY = "testkey"; + + @BeforeClass + public static void setUp() throws Exception { + OzoneClient client = new OzoneClientStub(); + client.getObjectStore().createS3Bucket(S3BUCKET); + + + HttpHeaders headers = Mockito.mock(HttpHeaders.class); + when(headers.getHeaderString(STORAGE_CLASS_HEADER)).thenReturn("STANDARD"); + + REST.setHeaders(headers); + REST.setClient(client); + + OzoneConfiguration conf = new OzoneConfiguration(); + conf.setBoolean(OzoneConfigKeys.DFS_CONTAINER_RATIS_DATASTREAM_ENABLE, + true); + REST.setOzoneConfiguration(conf); + REST.setDatastreamEnabled(true); + REST.init(); + } + + @Test + public void testEnableStream() { + assertTrue(REST.isDatastreamEnabled()); + } + + @Test + public void testPartUpload() throws Exception { + + Response response = REST.initializeMultipartUpload(S3BUCKET, S3KEY); + MultipartUploadInitiateResponse multipartUploadInitiateResponse = + (MultipartUploadInitiateResponse) response.getEntity(); + assertNotNull(multipartUploadInitiateResponse.getUploadID()); + String uploadID = multipartUploadInitiateResponse.getUploadID(); + + assertEquals(200, response.getStatus()); + + String content = "Multipart Upload"; + ByteArrayInputStream body = + new ByteArrayInputStream(content.getBytes(UTF_8)); + response = REST.put(S3BUCKET, S3KEY, + content.length(), 1, uploadID, body); + + assertNotNull(response.getHeaderString("ETag")); + + } + + @Test + public void testPartUploadWithOverride() throws Exception { + + Response response = REST.initializeMultipartUpload(S3BUCKET, S3KEY); + MultipartUploadInitiateResponse multipartUploadInitiateResponse = + 
(MultipartUploadInitiateResponse) response.getEntity(); + assertNotNull(multipartUploadInitiateResponse.getUploadID()); + String uploadID = multipartUploadInitiateResponse.getUploadID(); + + assertEquals(200, response.getStatus()); + + String content = "Multipart Upload"; + ByteArrayInputStream body = + new ByteArrayInputStream(content.getBytes(UTF_8)); + response = REST.put(S3BUCKET, S3KEY, + content.length(), 1, uploadID, body); + + assertNotNull(response.getHeaderString("ETag")); + + String eTag = response.getHeaderString("ETag"); + + // Upload part again with same part Number, the ETag should be changed. + content = "Multipart Upload Changed"; + response = REST.put(S3BUCKET, S3KEY, + content.length(), 1, uploadID, body); + assertNotNull(response.getHeaderString("ETag")); + assertNotEquals(eTag, response.getHeaderString("ETag")); + + } + + + @Test + public void testPartUploadWithIncorrectUploadID() throws Exception { + try { + String content = "Multipart Upload With Incorrect uploadID"; + ByteArrayInputStream body = + new ByteArrayInputStream(content.getBytes(UTF_8)); + REST.put(S3BUCKET, S3KEY, content.length(), 1, + "random", body); + fail("testPartUploadWithIncorrectUploadID failed"); + } catch (OS3Exception ex) { + assertEquals("NoSuchUpload", ex.getCode()); + assertEquals(HTTP_NOT_FOUND, ex.getHttpCode()); + } + } +} diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/ContentGenerator.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/ContentGenerator.java index 92f7ae4b2ecd..b01c12f6b354 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/ContentGenerator.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/ContentGenerator.java @@ -18,10 +18,12 @@ import java.io.IOException; import java.io.OutputStream; +import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import com.google.common.annotations.VisibleForTesting; import org.apache.commons.lang3.RandomStringUtils; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; /** * Utility class to write random keys from a limited buffer. @@ -81,6 +83,22 @@ public void write(OutputStream outputStream) throws IOException { } } + /** + * Write the required bytes to the streaming output stream. 
+ */ + public void write(OzoneDataStreamOutput out) throws IOException { + for (long nrRemaining = keySize; + nrRemaining > 0; nrRemaining -= bufferSize) { + int curSize = (int) Math.min(bufferSize, nrRemaining); + for (int i = 0; i < curSize; i += copyBufferSize) { + ByteBuffer bb = + ByteBuffer.wrap(buffer, i, Math.min(copyBufferSize, curSize - i)); + out.write(bb); + } + } + out.close(); + } + @VisibleForTesting byte[] getBuffer() { return buffer; diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/OzoneClientKeyGenerator.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/OzoneClientKeyGenerator.java index 74cd0d0b3737..43cdfcfc50d8 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/OzoneClientKeyGenerator.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/OzoneClientKeyGenerator.java @@ -24,10 +24,13 @@ import org.apache.hadoop.hdds.cli.HddsVersionProvider; import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.ozone.client.OzoneBucket; import org.apache.hadoop.ozone.client.OzoneClient; import com.codahale.metrics.Timer; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import picocli.CommandLine.Command; import picocli.CommandLine.Mixin; import picocli.CommandLine.Option; @@ -74,6 +77,12 @@ public class OzoneClientKeyGenerator extends BaseFreonGenerator @Mixin private FreonReplicationOptions replication; + @Option( + names = {"--enable-streaming", "--stream"}, + description = "Specify whether the write will be through ratis streaming" + ) + private boolean enableRatisStreaming = false; + private Timer timer; private OzoneBucket bucket; @@ -101,7 +110,11 @@ public Void call() throws Exception { timer = getMetrics().timer("key-create"); - runTests(this::createKey); + if (enableRatisStreaming) { + runTests(this::createStreamKey); + } else { + runTests(this::createKey); + } } return null; } @@ -118,4 +131,18 @@ private void createKey(long counter) throws Exception { return null; }); } + + private void createStreamKey(long counter) throws Exception { + final ReplicationConfig conf = ReplicationConfig.fromProtoTypeAndFactor( + ReplicationType.RATIS, ReplicationFactor.THREE); + final String key = generateObjectName(counter); + + timer.time(() -> { + try (OzoneDataStreamOutput stream = bucket.createStreamKey( + key, keySize, conf, metadata)) { + contentGenerator.write(stream); + } + return null; + }); + } } diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/shell/keys/PutKeyHandler.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/shell/keys/PutKeyHandler.java index 7d7885d168d9..1f7c1ef7f49a 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/shell/keys/PutKeyHandler.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/shell/keys/PutKeyHandler.java @@ -23,10 +23,14 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; import java.util.HashMap; import java.util.Map; import org.apache.hadoop.conf.StorageUnit; +import org.apache.hadoop.hdds.client.ECReplicationConfig; import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.io.IOUtils; import 
org.apache.hadoop.ozone.OzoneConsts; @@ -34,9 +38,11 @@ import org.apache.hadoop.ozone.client.OzoneClient; import org.apache.hadoop.ozone.client.OzoneClientException; import org.apache.hadoop.ozone.client.OzoneVolume; +import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput; import org.apache.hadoop.ozone.shell.OzoneAddress; import org.apache.commons.codec.digest.DigestUtils; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType.EC; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_CHUNK_SIZE_DEFAULT; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_CHUNK_SIZE_KEY; @@ -89,10 +95,41 @@ protected void execute(OzoneClient client, OzoneAddress address) int chunkSize = (int) getConf().getStorageSize(OZONE_SCM_CHUNK_SIZE_KEY, OZONE_SCM_CHUNK_SIZE_DEFAULT, StorageUnit.BYTES); - try (InputStream input = new FileInputStream(dataFile); - OutputStream output = bucket.createKey(keyName, dataFile.length(), - replicationConfig, keyMetadata)) { - IOUtils.copyBytes(input, output, chunkSize); + + Boolean useAsync = false; + if (dataFile.length() <= chunkSize || + (replicationConfig != null && + replicationConfig.getReplicationType() == EC) || + bucket.getReplicationConfig() instanceof ECReplicationConfig) { + useAsync = true; + } + if (useAsync) { + if (isVerbose()) { + out().println("API: async"); + } + try (InputStream input = new FileInputStream(dataFile); + OutputStream output = bucket.createKey(keyName, dataFile.length(), + replicationConfig, keyMetadata)) { + IOUtils.copyBytes(input, output, chunkSize); + } + } else { + if (isVerbose()) { + out().println("API: streaming"); + } + try (RandomAccessFile raf = new RandomAccessFile(dataFile, "r"); + OzoneDataStreamOutput out = bucket.createStreamKey(keyName, + dataFile.length(), replicationConfig, keyMetadata)) { + FileChannel ch = raf.getChannel(); + long len = raf.length(); + long off = 0; + while (len > 0) { + long writeLen = Math.min(len, chunkSize); + ByteBuffer bb = ch.map(FileChannel.MapMode.READ_ONLY, off, writeLen); + out.write(bb); + off += writeLen; + len -= writeLen; + } + } } }
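
For illustration, below is a minimal end-to-end sketch of the streaming write path that this patch exercises from PutKeyHandler and OzoneClientKeyGenerator: create a key with bucket.createStreamKey(), write ByteBuffers through the returned OzoneDataStreamOutput, and close it to commit. The sketch is not part of the patch; the class name StreamingPutExample, the volume, bucket, and key names ("vol1", "bucket1", "streamed-key"), and the payload are placeholders, and it assumes a running Ozone cluster reachable with the default client configuration.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;

import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
import org.apache.hadoop.ozone.client.OzoneBucket;
import org.apache.hadoop.ozone.client.OzoneClient;
import org.apache.hadoop.ozone.client.OzoneClientFactory;
import org.apache.hadoop.ozone.client.io.OzoneDataStreamOutput;

/** Illustrative sketch only; not part of this patch. */
public final class StreamingPutExample {

  private StreamingPutExample() {
  }

  public static void main(String[] args) throws Exception {
    byte[] payload = "hello via ratis streaming"
        .getBytes(StandardCharsets.UTF_8);

    try (OzoneClient client =
        OzoneClientFactory.getRpcClient(new OzoneConfiguration())) {
      // "vol1" and "bucket1" are placeholders for an existing volume/bucket.
      OzoneBucket bucket = client.getObjectStore()
          .getVolume("vol1")
          .getBucket("bucket1");

      // Same replication setup as OzoneClientKeyGenerator.createStreamKey().
      ReplicationConfig replication =
          ReplicationConfig.fromProtoTypeAndFactor(
              ReplicationType.RATIS, ReplicationFactor.THREE);

      // createStreamKey returns an OzoneDataStreamOutput that accepts
      // ByteBuffers; closing it commits the key.
      try (OzoneDataStreamOutput out = bucket.createStreamKey(
          "streamed-key", payload.length, replication, new HashMap<>())) {
        out.write(ByteBuffer.wrap(payload));
      }
    }
  }
}

A larger payload would be written as a sequence of ByteBuffer chunks rather than one wrap() call, in the same way ContentGenerator.write(OzoneDataStreamOutput) and PutKeyHandler slice the data into chunk-sized buffers above.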