-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21660][YARN][Shuffle] Yarn ShuffleService failed to start when the chosen dir… #18905
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
d62405d
[SPARK-21660] Yarn ShuffleService failed to start when the chosen dir…
LiShuMing 2077537
Recovery path had already existed but unavailable, set it to null
LiShuMing 6841ca4
format code and add unit test
2e06fdb
default file access should not contain
LiShuMing e380c6f
should check recovery path access instead of db file
LiShuMing File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ | |
| import com.google.common.collect.Lists; | ||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.fs.FileUtil; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.hadoop.fs.permission.FsPermission; | ||
| import org.apache.hadoop.yarn.api.records.ContainerId; | ||
|
|
@@ -332,28 +333,76 @@ protected Path getRecoveryPath(String fileName) { | |
| return _recoveryPath; | ||
| } | ||
|
|
||
| /** | ||
| * Checks that the current running process can read, write the given file | ||
| * by using methods of the File objects. | ||
| * | ||
| * @param file File to check | ||
| * @return True if process has read, write and execute access on the path, or false. | ||
| */ | ||
| protected Boolean checkFileAccess(File file) { | ||
| if (!file.exists()) { | ||
| logger.warn("File is not existed: " + file.toString()); | ||
| return false; | ||
| } | ||
|
|
||
| if (!FileUtil.canRead(file)) { | ||
| logger.warn("File is not readable: " + file.toString()); | ||
| return false; | ||
| } | ||
|
|
||
| if (!FileUtil.canWrite(file)) { | ||
| logger.warn("File is not writable: " + file.toString()); | ||
| return false; | ||
| } | ||
|
|
||
| if (!FileUtil.canExecute(file)) { | ||
| logger.warn("File is not executable: " + file.toString()); | ||
| return false; | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| /** | ||
| * Figure out the recovery path and handle moving the DB if YARN NM recovery gets enabled | ||
| * when it previously was not. If YARN NM recovery is enabled it uses that path, otherwise | ||
| * it will uses a YARN local dir. | ||
| */ | ||
| protected File initRecoveryDb(String dbName) { | ||
| protected File initRecoveryDb(String dbName) throws IOException { | ||
| Boolean bolRecoveryPathAvailable = true; | ||
|
|
||
| if (_recoveryPath != null) { | ||
| File recoveryFile = new File(_recoveryPath.toUri().getPath(), dbName); | ||
| if (recoveryFile.exists()) { | ||
|
|
||
| bolRecoveryPathAvailable = checkFileAccess(new File(_recoveryPath.toUri().getPath())); | ||
| logger.info("Recovery path {} ldb available: {}.", _recoveryPath, bolRecoveryPathAvailable); | ||
|
|
||
| if (recoveryFile.exists() && bolRecoveryPathAvailable) { | ||
| return recoveryFile; | ||
| } | ||
| } | ||
|
|
||
| // If recovery path unavailable, no use it any more. | ||
| if (!bolRecoveryPathAvailable) { | ||
| logger.warn("Recovery path {} unavailable: set it to null", _recoveryPath); | ||
| _recoveryPath = null; | ||
| } | ||
|
|
||
| // db doesn't exist in recovery path go check local dirs for it | ||
| String[] localDirs = _conf.getTrimmedStrings("yarn.nodemanager.local-dirs"); | ||
| for (String dir : localDirs) { | ||
| File f = new File(new Path(dir).toUri().getPath(), dbName); | ||
| // 1. `_recoveryPath` not exists, `f` should be writable; | ||
| // 2. `_recoveryPath` exists, `newLoc` should be writable; | ||
| if (f.exists()) { | ||
| if (_recoveryPath == null) { | ||
| // If NM recovery is not enabled, we should specify the recovery path using NM local | ||
| // dirs, which is compatible with the old code. | ||
| _recoveryPath = new Path(dir); | ||
| return f; | ||
| if (checkFileAccess(f)) { | ||
| // If NM recovery is not enabled, we should specify the recovery path using NM local | ||
| // dirs, which is compatible with the old code. | ||
| _recoveryPath = new Path(dir); | ||
| return f; | ||
| } | ||
| } else { | ||
| // If the recovery path is set then either NM recovery is enabled or another recovery | ||
| // DB has been initialized. If NM recovery is enabled and had set the recovery path | ||
|
|
@@ -378,8 +427,24 @@ protected File initRecoveryDb(String dbName) { | |
| } | ||
| } | ||
| } | ||
|
|
||
| // Find a local_dir which is writable, to avoid creating ldb in a read-only disk. | ||
| if (_recoveryPath == null) { | ||
| for (String dir : localDirs) { | ||
| File f = new File(dir); | ||
| if (checkFileAccess(f)) { | ||
| _recoveryPath = new Path(dir); | ||
| break; | ||
| } else { | ||
| logger.warn("Local dir {} is not reachable.", dir); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (_recoveryPath == null) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If |
||
| _recoveryPath = new Path(localDirs[0]); | ||
| throw new IOException("Failed to choose a reachable DB recovery path, " + | ||
| "please check `yarn.nodemanager.local-dirs` and `yarn.nodemanager.recovery.dir` is available " + | ||
| "in the `yarn-site.xml`."); | ||
| } | ||
|
|
||
| return new File(_recoveryPath.toUri().getPath(), dbName); | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think recovery path is set by user or use yarn default, user should make sure the availability of this directory, and yarn internally relies on it. It doesn't make sense to change to another disk if recovery path is unavailable.