Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

package org.apache.hadoop.ozone;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.protobuf.ServiceException;
import java.io.File;
import java.io.IOException;
Expand Down Expand Up @@ -76,6 +78,18 @@ public final class OmUtils {
private static final SecureRandom SRAND = new SecureRandom();
private static byte[] randomBytes = new byte[32];

private static final long TRANSACTION_ID_SHIFT = 8;
// from the 64 bits of ObjectID (long variable), 2 bits are reserved for
// epoch and 8 bits for recursive directory creation, if required. This
// leaves 54 bits for the transaction ID. Also, the last transaction ID is
// reserved for creating S3G volume on OM start {@link
// OzoneManager#addS3GVolumeToDB()}.
public static final long EPOCH_ID_SHIFT = 62; // 64 - 2
public static final long REVERSE_EPOCH_ID_SHIFT = 2; // 64 - EPOCH_ID_SHIFT
public static final long MAX_TRXN_ID = (long) ((1 << 54) - 2);
public static final int EPOCH_WHEN_RATIS_NOT_ENABLED = 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we want these values to be 0 and 1 instead of 1 & 2 ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wanted to avoid 0 as we can assume that currently it is 0. This would give us a way to separate out objectIds created before this fix. If ever, these non-unique objectIds need to be fixed, it would be easy to identify them.

public static final int EPOCH_WHEN_RATIS_ENABLED = 2;

private OmUtils() {
}

Expand Down Expand Up @@ -360,8 +374,6 @@ public static Collection<String> emptyAsSingletonNull(Collection<String>
}
}



/**
* If a OM conf is only set with key suffixed with OM Node ID, return the
* set value.
Expand Down Expand Up @@ -524,6 +536,49 @@ public static OmKeyInfo prepareKeyForRecover(OmKeyInfo keyInfo,
}
}

public static int getOMEpoch(boolean isRatisEnabled) {
return isRatisEnabled ? EPOCH_WHEN_RATIS_ENABLED :
EPOCH_WHEN_RATIS_NOT_ENABLED;
}

/**
* Get the valid base object id given the transaction id.
* @param epoch a 2 bit epoch number. The 2 most significant bits of the
* object will be set to this epoch.
* @param txId of the transaction. This value cannot exceed 2^54 - 1 as
* out of the 64 bits for a long, 2 are reserved for the epoch
* and 8 for recursive directory creation.
* @return base object id allocated against the transaction
*/
public static long getObjectIdFromTxId(long epoch, long txId) {
Preconditions.checkArgument(txId <= MAX_TRXN_ID, "TransactionID " +
"exceeds max limit of " + MAX_TRXN_ID);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am thinking for this extreme case, user cannot write object anymore when TransactionID exceeds MAX_TRXN_ID, right? So what can we do for this, have to setup a new Ozone cluster to use?

return addEpochToTxId(epoch, txId);
}

/**
* Note - This function should not be called directly. It is directly called
* only from OzoneManager#addS3GVolumeToDB() which is a one time operation
* when OM is started first time to add S3G volume. In call other cases,
* getObjectIdFromTxId() should be called to append epoch to objectID.
*/
public static long addEpochToTxId(long epoch, long txId) {
long lsb54 = txId << TRANSACTION_ID_SHIFT;
long msb2 = epoch << EPOCH_ID_SHIFT;

return msb2 | lsb54;
}

/**
* Given an objectId, unset the 2 most significant bits to get the
* corresponding transaction index.
*/
@VisibleForTesting
public static long getTxIdFromObjectId(long objectId) {
return ((Long.MAX_VALUE >> REVERSE_EPOCH_ID_SHIFT) & objectId)
>> TRANSACTION_ID_SHIFT;
}

/**
* Verify key name is a valid name.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ public class ExitManager {

public void exitSystem(int status, String message, Throwable throwable,
Logger log) {
ExitUtils.terminate(1, message, throwable, log);
ExitUtils.terminate(status, message, throwable, log);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolClientSideTranslatorPB;
import org.apache.hadoop.ozone.HddsDatanodeService;
import org.apache.hadoop.ozone.MiniOzoneCluster;
import org.apache.hadoop.ozone.OmUtils;
import org.apache.hadoop.ozone.OzoneAcl;
import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.apache.hadoop.ozone.OzoneConsts;
Expand Down Expand Up @@ -111,6 +112,7 @@
import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE;
import static org.apache.hadoop.hdds.client.ReplicationFactor.THREE;
import static org.apache.hadoop.hdds.client.ReplicationType.STAND_ALONE;
import static org.apache.hadoop.ozone.OmUtils.MAX_TRXN_ID;
import static org.apache.hadoop.ozone.OzoneAcl.AclScope.ACCESS;
import static org.apache.hadoop.ozone.OzoneAcl.AclScope.DEFAULT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE;
Expand Down Expand Up @@ -254,8 +256,9 @@ public void testDefaultS3GVolumeExists() throws Exception {
Assert.assertEquals(ozoneVolume.getName(), s3VolumeName);
OMMetadataManager omMetadataManager =
cluster.getOzoneManager().getMetadataManager();
long transactionID = Long.MAX_VALUE -1 >> 8;
long objectID = transactionID << 8;
long transactionID = MAX_TRXN_ID + 1;
long objectID = OmUtils.addEpochToTxId(omMetadataManager.getOmEpoch(),
transactionID);
OmVolumeArgs omVolumeArgs =
cluster.getOzoneManager().getMetadataManager().getVolumeTable().get(
omMetadataManager.getVolumeKey(s3VolumeName));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,27 @@
import org.apache.hadoop.hdds.client.ReplicationType;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.ozone.MiniOzoneCluster;
import org.apache.hadoop.ozone.OmUtils;
import org.apache.hadoop.ozone.client.ObjectStore;
import org.apache.hadoop.ozone.client.OzoneBucket;
import org.apache.hadoop.ozone.client.OzoneClient;
import org.apache.hadoop.ozone.client.OzoneKey;
import org.apache.hadoop.ozone.client.OzoneVolume;
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
import org.apache.hadoop.ozone.om.helpers.OmBucketInfo;
import org.apache.hadoop.ozone.om.helpers.OmKeyArgs;
import org.apache.hadoop.ozone.om.helpers.OmKeyInfo;
import org.apache.hadoop.ozone.om.helpers.OmVolumeArgs;
import org.apache.hadoop.ozone.om.protocolPB.OmTransportFactory;
import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolClientSideTranslatorPB;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.test.GenericTestUtils;

import org.apache.commons.lang3.RandomStringUtils;

import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_RATIS_PIPELINE_LIMIT;
import static org.apache.hadoop.ozone.OmUtils.EPOCH_ID_SHIFT;
import static org.apache.hadoop.ozone.OmUtils.EPOCH_WHEN_RATIS_NOT_ENABLED;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_ACL_ENABLED;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_ADMINISTRATORS;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_ADMINISTRATORS_WILDCARD;
Expand Down Expand Up @@ -104,7 +114,6 @@ public void testRestartOMWithVolumeOperation() throws Exception {
String volumeName = "volume" + RandomStringUtils.randomNumeric(5);

OzoneClient client = cluster.getClient();

ObjectStore objectStore = client.getObjectStore();

objectStore.createVolume(volumeName);
Expand Down Expand Up @@ -209,5 +218,94 @@ public void testRestartOMWithKeyOperation() throws Exception {
ReplicationType.RATIS));
}

@Test
public void testUniqueTrxnIndexOnOMRestart() throws Exception {
// When OM is restarted, the transaction index for requests should not
// start from 0. It should incrementally increase from the last
// transaction index which was stored in DB before restart.

String volumeName = "volume" + RandomStringUtils.randomNumeric(5);
String bucketName = "bucket" + RandomStringUtils.randomNumeric(5);
String keyName = "key" + RandomStringUtils.randomNumeric(5);

OzoneManager om = cluster.getOzoneManager();
OzoneClient client = cluster.getClient();
ObjectStore objectStore = client.getObjectStore();

UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
OzoneManagerProtocolClientSideTranslatorPB omClient =
new OzoneManagerProtocolClientSideTranslatorPB(
OmTransportFactory.create(conf, ugi, null),
RandomStringUtils.randomAscii(5));

objectStore.createVolume(volumeName);

// Verify that the last transactionIndex stored in DB after volume
// creation equals the transaction index corresponding to volume's
// objectID. Also, the volume transaction index should be 1 as this is
// the first transaction in this cluster.
OmVolumeArgs volumeInfo = omClient.getVolumeInfo(volumeName);
long volumeTrxnIndex = OmUtils.getTxIdFromObjectId(
volumeInfo.getObjectID());
Assert.assertEquals(1, volumeTrxnIndex);
Assert.assertEquals(volumeTrxnIndex, om.getLastTrxnIndexForNonRatis());

OzoneVolume ozoneVolume = objectStore.getVolume(volumeName);
ozoneVolume.createBucket(bucketName);

// Verify last transactionIndex is updated after bucket creation
OmBucketInfo bucketInfo = omClient.getBucketInfo(volumeName, bucketName);
long bucketTrxnIndex = OmUtils.getTxIdFromObjectId(
bucketInfo.getObjectID());
Assert.assertEquals(2, bucketTrxnIndex);
Assert.assertEquals(bucketTrxnIndex, om.getLastTrxnIndexForNonRatis());

// Restart the OM and create new object
cluster.restartOzoneManager();

String data = "random data";
OzoneOutputStream ozoneOutputStream = ozoneVolume.getBucket(bucketName)
.createKey(keyName, data.length(), ReplicationType.RATIS,
ReplicationFactor.ONE, new HashMap<>());
ozoneOutputStream.write(data.getBytes(), 0, data.length());
ozoneOutputStream.close();

// Verify last transactionIndex is updated after key creation and the
// transaction index after restart is incremented from the last
// transaction index before restart.
OmKeyInfo omKeyInfo = omClient.lookupKey(new OmKeyArgs.Builder()
.setVolumeName(volumeName)
.setBucketName(bucketName)
.setKeyName(keyName)
.setRefreshPipeline(true).build());
long keyTrxnIndex = OmUtils.getTxIdFromObjectId(
omKeyInfo.getObjectID());
Assert.assertEquals(3, keyTrxnIndex);
// Key commit is a separate transaction. Hence, the last trxn index in DB
// should be 1 more than KeyTrxnIndex
Assert.assertEquals(4, om.getLastTrxnIndexForNonRatis());
}

@Test
public void testEpochIntegrationInObjectID() throws Exception {
// Create a volume and check the objectID has the epoch as
// EPOCH_FOR_RATIS_NOT_ENABLED in the first 2 bits.

OzoneClient client = cluster.getClient();
ObjectStore objectStore = client.getObjectStore();

String volumeName = "volume" + RandomStringUtils.randomNumeric(5);
objectStore.createVolume(volumeName);

UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
OzoneManagerProtocolClientSideTranslatorPB omClient =
new OzoneManagerProtocolClientSideTranslatorPB(
OmTransportFactory.create(conf, ugi, null),
RandomStringUtils.randomAscii(5));

long volObjId = omClient.getVolumeInfo(volumeName).getObjectID();
long epochInVolObjId = volObjId >> EPOCH_ID_SHIFT;

Assert.assertEquals(EPOCH_WHEN_RATIS_NOT_ENABLED, epochInVolObjId);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ public interface OMMetadataManager {
*/
OzoneManagerLock getLock();

/**
* Returns the epoch associated with current OM process.
*/
long getOmEpoch();

/**
* Given a volume return the corresponding DB key.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,12 @@
*/
public final class OMTransactionInfo {

// Term associated with Ratis Log index in Ratis enabled cluster. In
// non-Ratis cluster, term is set to -1.
private long term; // term associated with the ratis log index.
// Transaction index corresponds to ratis log index
// Ratis Log index in Ratis enabled cluster or the unique transaction
// index {@link OzoneManagerServerSideTransalatorPB#transactionIndex} in
// non-Ratis cluster
private long transactionIndex;

private OMTransactionInfo(String transactionInfo) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import org.apache.hadoop.hdds.utils.db.cache.CacheKey;
import org.apache.hadoop.hdds.utils.db.cache.CacheValue;
import org.apache.hadoop.hdds.utils.db.cache.TableCacheImpl;
import org.apache.hadoop.ozone.OmUtils;
import org.apache.hadoop.ozone.OzoneConsts;
import org.apache.hadoop.ozone.common.BlockGroup;
import org.apache.hadoop.ozone.om.codec.OMTransactionInfoCodec;
Expand Down Expand Up @@ -162,6 +163,15 @@ public class OmMetadataManagerImpl implements OMMetadataManager {
private boolean isRatisEnabled;
private boolean ignorePipelineinKey;

// Epoch is used to generate the objectIDs. The most significant 2 bits of
// objectIDs is set to this epoch. For clusters before HDDS-4315 there is
// no epoch as such. But it can be safely assumed that the most significant
// 2 bits of the objectID will be 00. From HDDS-4315 onwards, the Epoch for
// non-ratis OM clusters will be binary 01 (= decimal 1) and for ratis
// enabled OM cluster will be binary 10 (= decimal 2). This epoch is added
// to ensure uniqueness of objectIDs.
private final long omEpoch;

private Map<String, Table> tableMap = new HashMap<>();

public OmMetadataManagerImpl(OzoneConfiguration conf) throws IOException {
Expand All @@ -176,6 +186,7 @@ public OmMetadataManagerImpl(OzoneConfiguration conf) throws IOException {
isRatisEnabled = conf.getBoolean(
OMConfigKeys.OZONE_OM_RATIS_ENABLE_KEY,
OMConfigKeys.OZONE_OM_RATIS_ENABLE_DEFAULT);
this.omEpoch = OmUtils.getOMEpoch(isRatisEnabled);
// For test purpose only
ignorePipelineinKey = conf.getBoolean(
"ozone.om.ignore.pipeline", Boolean.TRUE);
Expand All @@ -189,6 +200,7 @@ protected OmMetadataManagerImpl() {
this.lock = new OzoneManagerLock(new OzoneConfiguration());
this.openKeyExpireThresholdMS =
OZONE_OPEN_KEY_EXPIRE_THRESHOLD_SECONDS_DEFAULT;
this.omEpoch = 0;
}

@Override
Expand Down Expand Up @@ -235,7 +247,6 @@ public Table<String, OmMultipartKeyInfo> getMultipartInfoTable() {
return multipartInfoTable;
}


private void checkTableStatus(Table table, String name) throws IOException {
String logMessage = "Unable to get a reference to %s table. Cannot " +
"continue.";
Expand Down Expand Up @@ -498,6 +509,11 @@ public org.apache.hadoop.ozone.om.lock.OzoneManagerLock getLock() {
return lock;
}

@Override
public long getOmEpoch() {
return omEpoch;
}

/**
* Returns true if the firstArray startsWith the bytes of secondArray.
*
Expand Down
Loading