-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-1496] Fixing detection of GCS FileSystem #2500
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
463fc92
1d5fb46
d86ef4a
3987af2
f78ed1f
cfd622a
2da574f
5af6449
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.hudi.common.fs; | ||
|
|
||
| import org.apache.hadoop.fs.FSDataInputStream; | ||
|
|
||
| import java.io.EOFException; | ||
| import java.io.IOException; | ||
| import java.io.InputStream; | ||
|
|
||
| /** | ||
| * Scheme aware FSDataInputStream so that we manipulate seeks for GS filesystem. | ||
| */ | ||
| public class SchemeAwareFSDataInputStream extends FSDataInputStream { | ||
|
|
||
| private final boolean isGCSFileSystem; | ||
|
|
||
| public SchemeAwareFSDataInputStream(InputStream in, boolean isGCSFileSystem) { | ||
| super(in); | ||
| this.isGCSFileSystem = isGCSFileSystem; | ||
| } | ||
|
|
||
| @Override | ||
| public void seek(long desired) throws IOException { | ||
| try { | ||
| super.seek(desired); | ||
| } catch (EOFException e) { | ||
| // with GCSFileSystem, accessing the last byte might throw EOFException and hence this fix. | ||
| if (isGCSFileSystem) { | ||
| super.seek(desired - 1); | ||
| } else { | ||
| throw e; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ | |
| package org.apache.hudi.common.table.log; | ||
|
|
||
| import org.apache.hudi.common.fs.FSUtils; | ||
| import org.apache.hudi.common.fs.SchemeAwareFSDataInputStream; | ||
| import org.apache.hudi.common.fs.TimedFSDataInputStream; | ||
| import org.apache.hudi.common.model.HoodieLogFile; | ||
| import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; | ||
|
|
@@ -75,20 +76,8 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader { | |
| public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, | ||
| boolean readBlockLazily, boolean reverseReader) throws IOException { | ||
| FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize); | ||
| if (FSUtils.isGCSInputStream(fsDataInputStream)) { | ||
| this.inputStream = new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( | ||
| new BufferedFSInputStream((FSInputStream) (( | ||
| (FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream()), bufferSize))); | ||
| } else if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { | ||
| this.inputStream = new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( | ||
| new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize))); | ||
| } else { | ||
| // fsDataInputStream.getWrappedStream() maybe a BufferedFSInputStream | ||
| // need to wrap in another BufferedFSInputStream the make bufferSize work? | ||
| this.inputStream = fsDataInputStream; | ||
| } | ||
|
|
||
| this.logFile = logFile; | ||
| this.inputStream = getFSDataInputStream(fsDataInputStream, fs, bufferSize); | ||
| this.readerSchema = readerSchema; | ||
| this.readBlockLazily = readBlockLazily; | ||
| this.reverseReader = reverseReader; | ||
|
|
@@ -107,6 +96,56 @@ public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSc | |
| this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false); | ||
| } | ||
|
|
||
| /** | ||
| * Fetch the right {@link FSDataInputStream} to be used by wrapping with required input streams. | ||
| * @param fsDataInputStream original instance of {@link FSDataInputStream}. | ||
| * @param fs instance of {@link FileSystem} in use. | ||
| * @param bufferSize buffer size to be used. | ||
| * @return the right {@link FSDataInputStream} as required. | ||
| */ | ||
| private FSDataInputStream getFSDataInputStream(FSDataInputStream fsDataInputStream, FileSystem fs, int bufferSize) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would rather rewrite it like this to reduce cyclomatic complexity, but I am also fine with what is here originally: private FSDataInputStream getFSDataInputStream(FSDataInputStream fsDataInputStream, FileSystem fs, int bufferSize) {
if (FSUtils.isGCSFileSystem(fs)) {
return new SchemeAwareFSDataInputStream(
getFSDataInputStreamForGCSFs(fsDataInputStream, fs, bufferSize), true);
}
if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream(
new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)));
}
return fsDataInputStream;
}
private FSDataInputStream getFSDataInputStreamForGCSFs(FSDataInputStream fsDataInputStream, FileSystem fs, int bufferSize) {
if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream(
new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)));
}
if (fsDataInputStream.getWrappedStream() instanceof FSDataInputStream
&& ((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream() instanceof FSInputStream) {
FSInputStream inputStream = (FSInputStream)((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream();
return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream(
new BufferedFSInputStream(inputStream, bufferSize)));
}
return fsDataInputStream;
}
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I also agree with you. Will restructure this a bit. |
||
| if (FSUtils.isGCSFileSystem(fs)) { | ||
| // in GCS FS, we might need to intercept seek offsets as we might get EOF exception | ||
| return new SchemeAwareFSDataInputStream(getFSDataInputStreamForGCS(fsDataInputStream, bufferSize), true); | ||
| } | ||
|
|
||
| if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { | ||
| return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( | ||
| new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize))); | ||
| } | ||
|
|
||
| // fsDataInputStream.getWrappedStream() may be a BufferedFSInputStream; | ||
| // do we need to wrap it in another BufferedFSInputStream to make bufferSize work? | ||
| return fsDataInputStream; | ||
| } | ||
|
|
||
| /** | ||
| * Builds the {@link FSDataInputStream} to read from on a GCS FileSystem, which needs special handling for seek. | ||
| * The underlying {@link FSInputStream} is looked up either directly under the given stream or one level deeper, | ||
| * and is then re-wrapped with buffering and timing streams. | ||
| * @param fsDataInputStream original instance of {@link FSDataInputStream}. | ||
| * @param bufferSize buffer size to be used. | ||
| * @return the re-wrapped stream, or the original {@code fsDataInputStream} when no {@link FSInputStream} is found. | ||
| */ | ||
| private FSDataInputStream getFSDataInputStreamForGCS(FSDataInputStream fsDataInputStream, int bufferSize) { | ||
| // In case of GCS FS, the FSInputStream can sit at one of two depths: | ||
| // a. directly as fsDataInputStream.getWrappedStream() | ||
| // b. inside a nested FSDataInputStream returned by fsDataInputStream.getWrappedStream() | ||
| // If neither matches, we fall back to the original fsDataInputStream. | ||
| FSInputStream source = null; | ||
| if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { | ||
| source = (FSInputStream) fsDataInputStream.getWrappedStream(); | ||
| } else if (fsDataInputStream.getWrappedStream() instanceof FSDataInputStream) { | ||
| FSDataInputStream nested = (FSDataInputStream) fsDataInputStream.getWrappedStream(); | ||
| if (nested.getWrappedStream() instanceof FSInputStream) { | ||
| source = (FSInputStream) nested.getWrappedStream(); | ||
| } | ||
| } | ||
|
|
||
| if (source == null) { | ||
| return fsDataInputStream; | ||
| } | ||
|
|
||
| BufferedFSInputStream buffered = new BufferedFSInputStream(source, bufferSize); | ||
| return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream(buffered)); | ||
| } | ||
|
|
||
| @Override | ||
| public HoodieLogFile getLogFile() { | ||
| return logFile; | ||
|
|
@@ -238,11 +277,7 @@ private HoodieLogBlock createCorruptBlock() throws IOException { | |
| private boolean isBlockCorrupt(int blocksize) throws IOException { | ||
| long currentPos = inputStream.getPos(); | ||
| try { | ||
| if (FSUtils.isGCSInputStream(inputStream)) { | ||
| inputStream.seek(currentPos + blocksize - 1); | ||
| } else { | ||
| inputStream.seek(currentPos + blocksize); | ||
| } | ||
| inputStream.seek(currentPos + blocksize); | ||
| } catch (EOFException e) { | ||
| LOG.info("Found corrupted block in file " + logFile + " with block size(" + blocksize + ") running past EOF"); | ||
| // this is corrupt | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.