-
Notifications
You must be signed in to change notification settings - Fork 587
HDDS-10230. Preventing V3 Schema from Creating Container DB in the Wrong Location #6113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
1a3ea17
f42c4f4
148cd24
fade60c
0344edf
9b344dd
a736e71
564c710
30c978b
32cd826
92fe762
2205d39
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -92,6 +92,7 @@ public class HddsVolume extends StorageVolume { | |
| private File dbParentDir; | ||
| private File deletedContainerDir; | ||
| private AtomicBoolean dbLoaded = new AtomicBoolean(false); | ||
| private final AtomicBoolean dbLoadFailure = new AtomicBoolean(false); | ||
|
|
||
| /** | ||
| * Builder for HddsVolume. | ||
|
|
@@ -257,14 +258,23 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused) | |
| VolumeCheckResult result = super.check(unused); | ||
|
|
||
| DatanodeConfiguration df = getConf().getObject(DatanodeConfiguration.class); | ||
| if (isDbLoadFailure()) { | ||
| LOG.warn("Volume {} failed to access RocksDB: RocksDB parent directory is null, " + | ||
| "the volume might not have been loaded properly.", getStorageDir()); | ||
| return VolumeCheckResult.FAILED; | ||
| } | ||
| if (result != VolumeCheckResult.HEALTHY || | ||
| !df.getContainerSchemaV3Enabled() || !isDbLoaded()) { | ||
| return result; | ||
| } | ||
|
|
||
| // Check that per-volume RocksDB is present. | ||
| File dbFile = new File(dbParentDir, CONTAINER_DB_NAME); | ||
| if (!dbFile.exists() || !dbFile.canRead()) { | ||
| File dbFile = dbParentDir == null ? null : new File(dbParentDir, CONTAINER_DB_NAME); | ||
| if (dbFile == null || !dbFile.exists() || !dbFile.canRead()) { | ||
| if (dbFile == null) { | ||
| LOG.warn("Volume {} failed to access RocksDB: RocksDB parent directory is null, " + | ||
| "the volume might not have been loaded properly.", getStorageDir()); | ||
| } | ||
| LOG.warn("Volume {} failed health check. Could not access RocksDB at " + | ||
| "{}", getStorageDir(), dbFile); | ||
| return VolumeCheckResult.FAILED; | ||
|
|
@@ -326,6 +336,10 @@ public boolean isDbLoaded() { | |
| return dbLoaded.get(); | ||
| } | ||
|
|
||
| public boolean isDbLoadFailure() { | ||
| return dbLoadFailure.get(); | ||
| } | ||
|
|
||
| public void loadDbStore(boolean readOnly) throws IOException { | ||
| // DN startup for the first time, not registered yet, | ||
| // so the DbVolume is not formatted. | ||
|
|
@@ -343,35 +357,41 @@ public void loadDbStore(boolean readOnly) throws IOException { | |
| File clusterIdDir = new File(dbVolume == null ? | ||
| getStorageDir() : dbVolume.getStorageDir(), | ||
| getClusterID()); | ||
| if (!clusterIdDir.exists()) { | ||
| throw new IOException("Working dir " + clusterIdDir.getAbsolutePath() + | ||
| " not created for HddsVolume: " + getStorageDir().getAbsolutePath()); | ||
| } | ||
| try { | ||
| if (!clusterIdDir.exists()) { | ||
| throw new IOException("Working dir " + clusterIdDir.getAbsolutePath() + | ||
| " not created for HddsVolume: " + getStorageDir().getAbsolutePath()); | ||
| } | ||
|
|
||
| File storageIdDir = new File(clusterIdDir, getStorageID()); | ||
| if (!storageIdDir.exists()) { | ||
| throw new IOException("Db parent dir " + storageIdDir.getAbsolutePath() + | ||
| " not found for HddsVolume: " + getStorageDir().getAbsolutePath()); | ||
| } | ||
| File storageIdDir = new File(clusterIdDir, getStorageID()); | ||
| if (!storageIdDir.exists()) { | ||
| throw new IOException("Db parent dir " + storageIdDir.getAbsolutePath() + | ||
| " not found for HddsVolume: " + getStorageDir().getAbsolutePath()); | ||
| } | ||
|
|
||
| File containerDBFile = new File(storageIdDir, CONTAINER_DB_NAME); | ||
| if (!containerDBFile.exists()) { | ||
| throw new IOException("Db dir " + storageIdDir.getAbsolutePath() + | ||
| " not found for HddsVolume: " + getStorageDir().getAbsolutePath()); | ||
| } | ||
| File containerDBFile = new File(storageIdDir, CONTAINER_DB_NAME); | ||
| if (!containerDBFile.exists()) { | ||
| throw new IOException("Db dir " + storageIdDir.getAbsolutePath() + | ||
| " not found for HddsVolume: " + getStorageDir().getAbsolutePath()); | ||
| } | ||
|
|
||
| String containerDBPath = containerDBFile.getAbsolutePath(); | ||
| try { | ||
| initPerDiskDBStore(containerDBPath, getConf(), readOnly); | ||
| String containerDBPath = containerDBFile.getAbsolutePath(); | ||
| try { | ||
| initPerDiskDBStore(containerDBPath, getConf(), readOnly); | ||
| } catch (IOException e) { | ||
| throw new IOException("Can't init db instance under path " | ||
| + containerDBPath + " for volume " + getStorageID(), e); | ||
| } | ||
|
|
||
| dbParentDir = storageIdDir; | ||
| dbLoaded.set(true); | ||
| dbLoadFailure.set(false); | ||
| LOG.info("SchemaV3 db is loaded at {} for volume {}", containerDBPath, | ||
| getStorageID()); | ||
| } catch (IOException e) { | ||
| throw new IOException("Can't init db instance under path " | ||
| + containerDBPath + " for volume " + getStorageID(), e); | ||
| dbLoadFailure.set(true); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @xichen01 , thanks for reporting this. Can you share more detail about which operation will cause the DB load failure and throw the IOException, besides the known initPerDiskDBStore? Is there any exception stack can be shared?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to root cause this issue. Could you kindly check the related DN logs to see if there is any related suspicious logs? If we don't have the root cause, then this extra IOException catch may not enough.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not found others related log which can help to locate the root cause, this may be a legacy issue that may have existed when the cluster was created, but with this PR, we could have found this issue faster.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have observed another impact when volume metadata path is
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @xichen01 , in case we don't know the exact case of DB loading failure, let's change this catch from IOException to Throwable.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @xichen01 , could you please address my above comment and provide a new patch?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, I will |
||
| throw e; | ||
| } | ||
|
|
||
| dbParentDir = storageIdDir; | ||
| dbLoaded.set(true); | ||
| LOG.info("SchemaV3 db is loaded at {} for volume {}", containerDBPath, | ||
| getStorageID()); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -417,9 +437,11 @@ public void createDbStore(MutableVolumeSet dbVolumeSet) throws IOException { | |
| try { | ||
| HddsVolumeUtil.initPerDiskDBStore(containerDBPath, getConf(), false); | ||
| dbLoaded.set(true); | ||
| dbLoadFailure.set(false); | ||
| LOG.info("SchemaV3 db is created and loaded at {} for volume {}", | ||
| containerDBPath, getStorageID()); | ||
| } catch (IOException e) { | ||
| dbLoadFailure.set(true); | ||
| String errMsg = "Can't create db instance under path " | ||
| + containerDBPath + " for volume " + getStorageID(); | ||
| LOG.error(errMsg, e); | ||
|
|
@@ -448,6 +470,7 @@ private void closeDbStore() { | |
| .getAbsolutePath(); | ||
| DatanodeStoreCache.getInstance().removeDB(containerDBPath); | ||
| dbLoaded.set(false); | ||
| dbLoadFailure.set(false); | ||
| LOG.info("SchemaV3 db is stopped at {} for volume {}", containerDBPath, | ||
| getStorageID()); | ||
| } | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If it passes L261 check, does it still have the chance to have a null dbParentDir?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
dbParentDirwill non-null if it passes L261 check. I think this can be simplified