Skip to content

Commit b6a1226

Browse files
hemantk-12adoroszlai
authored andcommitted
HDDS-9486. Fix deadlock causing intermittent fork timeout in TestSnapshotBackgroundServices (#6026)
(cherry picked from commit 2c0580d)
1 parent 9954d41 commit b6a1226

File tree

4 files changed

+63
-70
lines changed

4 files changed

+63
-70
lines changed

hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,16 +1421,21 @@ public String getCompactionLogDir() {
14211421
* those are not needed to generate snapshot diff. These files are basically
14221422
* non-leaf nodes of the DAG.
14231423
*/
1424-
public synchronized void pruneSstFiles() {
1424+
public void pruneSstFiles() {
14251425
if (!shouldRun()) {
14261426
return;
14271427
}
14281428

14291429
Set<String> nonLeafSstFiles;
1430-
nonLeafSstFiles = forwardCompactionDAG.nodes().stream()
1431-
.filter(node -> !forwardCompactionDAG.successors(node).isEmpty())
1432-
.map(node -> node.getFileName())
1433-
.collect(Collectors.toSet());
1430+
// This is synchronized because compaction thread can update the compactionDAG and can be in situation
1431+
// when nodes are added to the graph, but arcs are still in progress.
1432+
// Hence, the lock is taken.
1433+
synchronized (this) {
1434+
nonLeafSstFiles = forwardCompactionDAG.nodes().stream()
1435+
.filter(node -> !forwardCompactionDAG.successors(node).isEmpty())
1436+
.map(node -> node.getFileName())
1437+
.collect(Collectors.toSet());
1438+
}
14341439

14351440
if (CollectionUtils.isNotEmpty(nonLeafSstFiles)) {
14361441
LOG.info("Removing SST files: {} as part of SST file pruning.",
@@ -1448,8 +1453,13 @@ public void incrementTarballRequestCount() {
14481453
tarballRequestCount.incrementAndGet();
14491454
}
14501455

1451-
public void decrementTarballRequestCount() {
1452-
tarballRequestCount.decrementAndGet();
1456+
public void decrementTarballRequestCountAndNotify() {
1457+
// Synchronized block is used to ensure that lock is on the same instance notifyAll is being called.
1458+
synchronized (this) {
1459+
tarballRequestCount.decrementAndGet();
1460+
// Notify compaction threads to continue.
1461+
notifyAll();
1462+
}
14531463
}
14541464

14551465
public boolean shouldRun() {
@@ -1517,8 +1527,7 @@ public static RocksDBCheckpointDiffer getInstance(
15171527
* for cache.
15181528
*/
15191529
public static void invalidateCacheEntry(String cacheKey) {
1520-
IOUtils.closeQuietly(INSTANCE_MAP.get(cacheKey));
1521-
INSTANCE_MAP.remove(cacheKey);
1530+
IOUtils.close(LOG, INSTANCE_MAP.remove(cacheKey));
15221531
}
15231532
}
15241533

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,13 +216,16 @@ public static void shutdown() {
216216
*/
217217
public static String createKey(OzoneBucket ozoneBucket) throws IOException {
218218
String keyName = "key" + RandomStringUtils.randomNumeric(5);
219+
createKey(ozoneBucket, keyName);
220+
return keyName;
221+
}
222+
223+
public static void createKey(OzoneBucket ozoneBucket, String keyName) throws IOException {
219224
String data = "data" + RandomStringUtils.randomNumeric(5);
220-
OzoneOutputStream ozoneOutputStream = ozoneBucket.createKey(keyName,
221-
data.length(), ReplicationType.RATIS,
225+
OzoneOutputStream ozoneOutputStream = ozoneBucket.createKey(keyName, data.length(), ReplicationType.RATIS,
222226
ReplicationFactor.ONE, new HashMap<>());
223227
ozoneOutputStream.write(data.getBytes(UTF_8), 0, data.length());
224228
ozoneOutputStream.close();
225-
return keyName;
226229
}
227230

228231
protected OzoneBucket setupBucket() throws Exception {

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotBackgroundServices.java

Lines changed: 35 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
*/
1717
package org.apache.hadoop.ozone.om;
1818

19-
import org.apache.commons.lang3.RandomStringUtils;
2019
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
2120
import org.apache.hadoop.hdds.conf.StorageUnit;
2221
import org.apache.hadoop.hdds.utils.IOUtils;
@@ -40,6 +39,7 @@
4039
import org.apache.hadoop.ozone.om.helpers.BucketLayout;
4140
import org.apache.hadoop.ozone.om.helpers.OmKeyInfo;
4241
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
42+
import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServerConfig;
4343
import org.apache.hadoop.ozone.om.snapshot.ReferenceCounted;
4444
import org.apache.hadoop.ozone.om.snapshot.SnapshotCache;
4545
import org.apache.hadoop.ozone.snapshot.SnapshotDiffReportOzone;
@@ -48,7 +48,6 @@
4848
import org.apache.ozone.rocksdiff.CompactionNode;
4949
import org.apache.ozone.test.GenericTestUtils;
5050
import org.apache.ozone.test.LambdaTestUtils;
51-
import org.apache.ozone.test.tag.Flaky;
5251
import org.apache.ratis.server.protocol.TermIndex;
5352
import org.junit.jupiter.api.AfterEach;
5453
import org.junit.jupiter.api.Assertions;
@@ -67,6 +66,7 @@
6766
import java.util.UUID;
6867
import java.util.concurrent.TimeUnit;
6968
import java.util.concurrent.TimeoutException;
69+
import java.util.concurrent.atomic.AtomicInteger;
7070
import java.util.concurrent.atomic.AtomicLong;
7171
import java.util.concurrent.atomic.AtomicReference;
7272

@@ -86,23 +86,21 @@
8686
* Tests snapshot background services.
8787
*/
8888
@Timeout(5000)
89-
@Flaky("HDDS-9455")
9089
public class TestSnapshotBackgroundServices {
91-
92-
private MiniOzoneHAClusterImpl cluster = null;
90+
private MiniOzoneHAClusterImpl cluster;
9391
private ObjectStore objectStore;
9492
private OzoneBucket ozoneBucket;
9593
private String volumeName;
9694
private String bucketName;
9795

9896
private static final long SNAPSHOT_THRESHOLD = 50;
9997
private static final int LOG_PURGE_GAP = 50;
100-
// This test depends on direct RocksDB checks that are easier done with OBS
101-
// buckets.
102-
private static final BucketLayout TEST_BUCKET_LAYOUT =
103-
BucketLayout.OBJECT_STORE;
104-
private static final String SNAPSHOT_NAME_PREFIX = "snapshot";
98+
// This test depends on direct RocksDB checks that are easier done with OBS buckets.
99+
private static final BucketLayout TEST_BUCKET_LAYOUT = BucketLayout.OBJECT_STORE;
100+
private static final String SNAPSHOT_NAME_PREFIX = "snapshot-";
101+
private static final String KEY_NAME_PREFIX = "key-";
105102
private OzoneClient client;
103+
private final AtomicInteger counter = new AtomicInteger();
106104

107105
/**
108106
* Create a MiniOzoneCluster for testing. The cluster initially has one
@@ -115,11 +113,12 @@ public void init(TestInfo testInfo) throws Exception {
115113
String clusterId = UUID.randomUUID().toString();
116114
String scmId = UUID.randomUUID().toString();
117115
String omServiceId = "om-service-test1";
116+
OzoneManagerRatisServerConfig omRatisConf = conf.getObject(OzoneManagerRatisServerConfig.class);
117+
omRatisConf.setLogAppenderWaitTimeMin(10);
118+
conf.setFromObject(omRatisConf);
118119
conf.setInt(OMConfigKeys.OZONE_OM_RATIS_LOG_PURGE_GAP, LOG_PURGE_GAP);
119-
conf.setStorageSize(OMConfigKeys.OZONE_OM_RATIS_SEGMENT_SIZE_KEY, 16,
120-
StorageUnit.KB);
121-
conf.setStorageSize(OMConfigKeys.
122-
OZONE_OM_RATIS_SEGMENT_PREALLOCATED_SIZE_KEY, 16, StorageUnit.KB);
120+
conf.setStorageSize(OMConfigKeys.OZONE_OM_RATIS_SEGMENT_SIZE_KEY, 16, StorageUnit.KB);
121+
conf.setStorageSize(OMConfigKeys.OZONE_OM_RATIS_SEGMENT_PREALLOCATED_SIZE_KEY, 16, StorageUnit.KB);
123122
if ("testSSTFilteringBackgroundService".equals(testInfo.getDisplayName())) {
124123
conf.setTimeDuration(OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL, 1,
125124
TimeUnit.SECONDS);
@@ -174,12 +173,12 @@ public void init(TestInfo testInfo) throws Exception {
174173
client = OzoneClientFactory.getRpcClient(omServiceId, conf);
175174
objectStore = client.getObjectStore();
176175

177-
volumeName = "volume" + RandomStringUtils.randomNumeric(5);
178-
bucketName = "bucket" + RandomStringUtils.randomNumeric(5);
176+
volumeName = "volume" + counter.incrementAndGet();
177+
bucketName = "bucket" + counter.incrementAndGet();
179178

180179
VolumeArgs createVolumeArgs = VolumeArgs.newBuilder()
181-
.setOwner("user" + RandomStringUtils.randomNumeric(5))
182-
.setAdmin("admin" + RandomStringUtils.randomNumeric(5))
180+
.setOwner("user" + counter.incrementAndGet())
181+
.setAdmin("admin" + counter.incrementAndGet())
183182
.build();
184183

185184
objectStore.createVolume(volumeName, createVolumeArgs);
@@ -224,8 +223,7 @@ public void testSnapshotAndKeyDeletionBackgroundServices()
224223
cluster.getOzoneManager(leaderOM.getOMNodeId());
225224
Assertions.assertEquals(leaderOM, newFollowerOM);
226225

227-
SnapshotInfo newSnapshot = createOzoneSnapshot(newLeaderOM,
228-
SNAPSHOT_NAME_PREFIX + RandomStringUtils.randomNumeric(5));
226+
SnapshotInfo newSnapshot = createOzoneSnapshot(newLeaderOM, SNAPSHOT_NAME_PREFIX + counter.incrementAndGet());
229227

230228
/*
231229
Check whether newly created key data is reclaimed
@@ -250,8 +248,7 @@ public void testSnapshotAndKeyDeletionBackgroundServices()
250248
Assertions.assertNotNull(keyInfoA);
251249

252250
// create snapshot b
253-
SnapshotInfo snapshotInfoB = createOzoneSnapshot(newLeaderOM,
254-
SNAPSHOT_NAME_PREFIX + RandomStringUtils.randomNumeric(5));
251+
SnapshotInfo snapshotInfoB = createOzoneSnapshot(newLeaderOM, SNAPSHOT_NAME_PREFIX + counter.incrementAndGet());
255252
Assertions.assertNotNull(snapshotInfoB);
256253

257254
// delete key a
@@ -261,8 +258,7 @@ public void testSnapshotAndKeyDeletionBackgroundServices()
261258
() -> !isKeyInTable(keyA, omKeyInfoTable));
262259

263260
// create snapshot c
264-
SnapshotInfo snapshotInfoC = createOzoneSnapshot(newLeaderOM,
265-
SNAPSHOT_NAME_PREFIX + RandomStringUtils.randomNumeric(5));
261+
SnapshotInfo snapshotInfoC = createOzoneSnapshot(newLeaderOM, SNAPSHOT_NAME_PREFIX + counter.incrementAndGet());
266262

267263
// get snapshot c
268264
OmSnapshot snapC;
@@ -279,8 +275,7 @@ public void testSnapshotAndKeyDeletionBackgroundServices()
279275
() -> isKeyInTable(keyA, snapC.getMetadataManager().getDeletedTable()));
280276

281277
// create snapshot d
282-
SnapshotInfo snapshotInfoD = createOzoneSnapshot(newLeaderOM,
283-
SNAPSHOT_NAME_PREFIX + RandomStringUtils.randomNumeric(5));
278+
SnapshotInfo snapshotInfoD = createOzoneSnapshot(newLeaderOM, SNAPSHOT_NAME_PREFIX + counter.incrementAndGet());
284279

285280
// delete snapshot c
286281
client.getObjectStore()
@@ -533,18 +528,14 @@ public void testSSTFilteringBackgroundService()
533528
private void confirmSnapDiffForTwoSnapshotsDifferingBySingleKey(
534529
OzoneManager ozoneManager)
535530
throws IOException, InterruptedException, TimeoutException {
536-
String firstSnapshot = createOzoneSnapshot(ozoneManager,
537-
TestSnapshotBackgroundServices.SNAPSHOT_NAME_PREFIX +
538-
RandomStringUtils.randomNumeric(10)).getName();
531+
String firstSnapshot = createOzoneSnapshot(ozoneManager, SNAPSHOT_NAME_PREFIX + counter.incrementAndGet())
532+
.getName();
539533
String diffKey = writeKeys(1).get(0);
540-
String secondSnapshot = createOzoneSnapshot(ozoneManager,
541-
TestSnapshotBackgroundServices.SNAPSHOT_NAME_PREFIX +
542-
RandomStringUtils.randomNumeric(10)).getName();
543-
SnapshotDiffReportOzone diff = getSnapDiffReport(volumeName, bucketName,
544-
firstSnapshot, secondSnapshot);
534+
String secondSnapshot = createOzoneSnapshot(ozoneManager, SNAPSHOT_NAME_PREFIX + counter.incrementAndGet())
535+
.getName();
536+
SnapshotDiffReportOzone diff = getSnapDiffReport(volumeName, bucketName, firstSnapshot, secondSnapshot);
545537
Assertions.assertEquals(Collections.singletonList(
546-
SnapshotDiffReportOzone.getDiffReportEntry(
547-
SnapshotDiffReport.DiffType.CREATE, diffKey, null)),
538+
SnapshotDiffReportOzone.getDiffReportEntry(SnapshotDiffReport.DiffType.CREATE, diffKey, null)),
548539
diff.getDiffList());
549540
}
550541

@@ -574,9 +565,7 @@ private static File getSstBackupDir(OzoneManager ozoneManager) {
574565
private void checkIfSnapshotGetsProcessedBySFS(OzoneManager ozoneManager)
575566
throws IOException, TimeoutException, InterruptedException {
576567
writeKeys(1);
577-
SnapshotInfo newSnapshot = createOzoneSnapshot(ozoneManager,
578-
TestSnapshotBackgroundServices.SNAPSHOT_NAME_PREFIX +
579-
RandomStringUtils.randomNumeric(5));
568+
SnapshotInfo newSnapshot = createOzoneSnapshot(ozoneManager, SNAPSHOT_NAME_PREFIX + counter.incrementAndGet());
580569
Assertions.assertNotNull(newSnapshot);
581570
Table<String, SnapshotInfo> snapshotInfoTable =
582571
ozoneManager.getMetadataManager().getSnapshotInfoTable();
@@ -640,22 +629,17 @@ private SnapshotDiffReportOzone getSnapDiffReport(String volume,
640629
return response.get().getSnapshotDiffReport();
641630
}
642631

643-
private SnapshotInfo createOzoneSnapshot(OzoneManager leaderOM, String name)
644-
throws IOException {
632+
private SnapshotInfo createOzoneSnapshot(OzoneManager leaderOM, String name) throws IOException {
645633
objectStore.createSnapshot(volumeName, bucketName, name);
646634

647-
String tableKey = SnapshotInfo.getTableKey(volumeName,
648-
bucketName,
649-
name);
635+
String tableKey = SnapshotInfo.getTableKey(volumeName, bucketName, name);
650636
SnapshotInfo snapshotInfo = leaderOM.getMetadataManager()
651637
.getSnapshotInfoTable()
652638
.get(tableKey);
653639
// Allow the snapshot to be written to disk
654-
String fileName =
655-
getSnapshotPath(leaderOM.getConfiguration(), snapshotInfo);
640+
String fileName = getSnapshotPath(leaderOM.getConfiguration(), snapshotInfo);
656641
File snapshotDir = new File(fileName);
657-
if (!RDBCheckpointUtils
658-
.waitForCheckpointDirectoryExist(snapshotDir)) {
642+
if (!RDBCheckpointUtils.waitForCheckpointDirectoryExist(snapshotDir)) {
659643
throw new IOException("snapshot directory doesn't exist");
660644
}
661645
return snapshotInfo;
@@ -665,7 +649,9 @@ private List<String> writeKeys(long keyCount) throws IOException {
665649
List<String> keys = new ArrayList<>();
666650
long index = 0;
667651
while (index < keyCount) {
668-
keys.add(createKey(ozoneBucket));
652+
String key = KEY_NAME_PREFIX + counter.incrementAndGet();
653+
createKey(ozoneBucket, key);
654+
keys.add(key);
669655
index++;
670656
}
671657
return keys;
@@ -679,5 +665,4 @@ private void readKeys(List<String> keys) throws IOException {
679665
inputStream.close();
680666
}
681667
}
682-
683668
}

hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMDBCheckpointServlet.java

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,9 @@ public DBCheckpoint getCheckpoint(Path tmpdir, boolean flush)
242242
long startTime = System.currentTimeMillis();
243243
long pauseCounter = PAUSE_COUNTER.incrementAndGet();
244244

245-
// Pause compactions, Copy/link files and get checkpoint.
246245
try {
247246
LOG.info("Compaction pausing {} started.", pauseCounter);
247+
// Pause compactions, Copy/link files and get checkpoint.
248248
differ.incrementTarballRequestCount();
249249
FileUtils.copyDirectory(compactionLogDir.getOriginalDir(),
250250
compactionLogDir.getTmpDir());
@@ -253,13 +253,9 @@ public DBCheckpoint getCheckpoint(Path tmpdir, boolean flush)
253253
checkpoint = getDbStore().getCheckpoint(flush);
254254
} finally {
255255
// Unpause the compaction threads.
256-
synchronized (getDbStore().getRocksDBCheckpointDiffer()) {
257-
differ.decrementTarballRequestCount();
258-
differ.notifyAll();
259-
long elapsedTime = System.currentTimeMillis() - startTime;
260-
LOG.info("Compaction pausing {} ended. Elapsed ms: {}",
261-
pauseCounter, elapsedTime);
262-
}
256+
differ.decrementTarballRequestCountAndNotify();
257+
long elapsedTime = System.currentTimeMillis() - startTime;
258+
LOG.info("Compaction pausing {} ended. Elapsed ms: {}", pauseCounter, elapsedTime);
263259
}
264260
return checkpoint;
265261
}

0 commit comments

Comments
 (0)