-
Notifications
You must be signed in to change notification settings - Fork 1.5k
PARQUET-674: Add InputFile abstraction for openable files. #368
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.parquet.io; | ||
|
|
||
| import java.io.IOException; | ||
|
|
||
| /** | ||
| * {@code ParquetDataSource} is an interface with the methods needed by Parquet | ||
| * to read data files using {@link SeekableInputStream} instances. | ||
| * | ||
| * <p>It exposes a location string, the total length in bytes, and a method | ||
| * that opens a new stream for the underlying data file. | ||
| */ | ||
| public interface ParquetDataSource { | ||
|
|
||
| /** | ||
| * Returns the file location. | ||
| * | ||
| * @return the location of the underlying file, as a string | ||
| */ | ||
| String getLocation(); | ||
|
||
|
|
||
| /** | ||
| * Returns the total length of the file, in bytes. | ||
| * | ||
| * @return the total length of the file, in bytes | ||
| * @throws IOException if the length cannot be determined | ||
| */ | ||
| long getLength() throws IOException; | ||
|
|
||
| /** | ||
| * Opens a new {@link SeekableInputStream} for the underlying | ||
| * data file. | ||
| * | ||
| * @return a newly opened {@link SeekableInputStream} for the underlying data file | ||
| * @throws IOException if the stream cannot be opened. | ||
| */ | ||
| SeekableInputStream newStream() throws IOException; | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.parquet.hadoop.util; | ||
|
|
||
| import org.apache.hadoop.conf.Configurable; | ||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.hadoop.fs.FileStatus; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.parquet.io.SeekableInputStream; | ||
| import org.apache.parquet.io.ParquetDataSource; | ||
| import java.io.IOException; | ||
|
|
||
| /** | ||
| * {@link ParquetDataSource} implementation backed by a Hadoop {@link FileSystem}. | ||
| * | ||
| * <p>An instance keeps the {@link FileSystem} and {@link FileStatus} it was | ||
| * created with: the location and length are answered from that status, and | ||
| * streams are opened through that file system. Instances are obtained from | ||
| * {@link #fromPath(Path, Configuration)} or | ||
| * {@link #fromStatus(FileStatus, Configuration)}. | ||
| */ | ||
| public class HadoopDataSource implements ParquetDataSource, Configurable { | ||
|
||
|
|
||
| // Replaceable through setConf; the file system and status below are fixed at construction. | ||
| private Configuration conf; | ||
| private final FileStatus stat; | ||
| private final FileSystem fs; | ||
|
|
||
| /** | ||
| * Creates a data source for {@code path}, looking up its file system and | ||
| * file status using {@code conf}. | ||
| * | ||
| * @param path the path of the file to read | ||
| * @param conf the configuration used to resolve the file system | ||
| * @return a data source for the file at {@code path} | ||
| * @throws IOException if the file system or the file status cannot be obtained | ||
| */ | ||
| public static HadoopDataSource fromPath(Path path, Configuration conf) | ||
| throws IOException { | ||
| FileSystem fileSystem = path.getFileSystem(conf); | ||
| FileStatus status = fileSystem.getFileStatus(path); | ||
| return new HadoopDataSource(fileSystem, status, conf); | ||
| } | ||
|
|
||
| /** | ||
| * Creates a data source from an already available file status, resolving | ||
| * the file system from the status path using {@code conf}. | ||
| * | ||
| * @param stat the status of the file to read | ||
| * @param conf the configuration used to resolve the file system | ||
| * @return a data source for the file described by {@code stat} | ||
| * @throws IOException if the file system cannot be obtained | ||
| */ | ||
| public static HadoopDataSource fromStatus(FileStatus stat, Configuration conf) | ||
| throws IOException { | ||
| Path location = stat.getPath(); | ||
| return new HadoopDataSource(location.getFileSystem(conf), stat, conf); | ||
| } | ||
|
|
||
| // Private: callers go through the static factories above. | ||
| private HadoopDataSource(FileSystem fileSystem, FileStatus status, Configuration configuration) { | ||
| this.fs = fileSystem; | ||
| this.stat = status; | ||
| this.conf = configuration; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the path held by the file status, rendered as a string. | ||
| */ | ||
| @Override | ||
| public String getLocation() { | ||
| Path location = stat.getPath(); | ||
| return location.toString(); | ||
| } | ||
|
|
||
| /** | ||
| * Returns the length reported by the file status held by this instance. | ||
| */ | ||
| @Override | ||
| public long getLength() { | ||
| final long length = stat.getLen(); | ||
| return length; | ||
| } | ||
|
|
||
| /** | ||
| * Opens the file through the stored file system and wraps the result | ||
| * with {@code HadoopStreams.wrap}. | ||
| * | ||
| * @throws IOException if the stream cannot be opened | ||
| */ | ||
| @Override | ||
| public SeekableInputStream newStream() throws IOException { | ||
| Path location = stat.getPath(); | ||
| return HadoopStreams.wrap(fs.open(location)); | ||
| } | ||
|
|
||
| /** | ||
| * Replaces the stored configuration. The file system resolved at | ||
| * construction is left unchanged. | ||
| */ | ||
| @Override | ||
| public void setConf(Configuration conf) { | ||
| this.conf = conf; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the configuration currently stored on this instance. | ||
| */ | ||
| @Override | ||
| public Configuration getConf() { | ||
| return this.conf; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a SeekableInputStream provider with a length.
Maybe call it InputFile?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I wasn't too happy with the name either. InputFile is something I hadn't thought of and sounds pretty good. I'll go with that.