-
Notifications
You must be signed in to change notification settings - Fork 1.6k
PARQUET-1201: Column indexes #527
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
aa571d7
6165a0c
1001994
dc645db
43ac3e1
d8e78eb
1f95eca
55d791c
85e699c
c215f1f
e551893
1206c60
5b55d87
6781c8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,166 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
| package org.apache.parquet.cli.commands; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.AbstractMap; | ||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Map.Entry; | ||
|
|
||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.parquet.cli.BaseCommand; | ||
| import org.apache.parquet.hadoop.ParquetFileReader; | ||
| import org.apache.parquet.hadoop.metadata.BlockMetaData; | ||
| import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; | ||
| import org.apache.parquet.hadoop.metadata.ParquetMetadata; | ||
| import org.apache.parquet.hadoop.util.HadoopInputFile; | ||
| import org.apache.parquet.internal.column.columnindex.ColumnIndex; | ||
| import org.apache.parquet.internal.column.columnindex.OffsetIndex; | ||
| import org.apache.parquet.io.InputFile; | ||
| import org.slf4j.Logger; | ||
|
|
||
| import com.beust.jcommander.Parameter; | ||
| import com.beust.jcommander.Parameters; | ||
| import com.google.common.base.Preconditions; | ||
| import com.google.common.collect.Lists; | ||
|
|
||
| /** | ||
| * parquet-cli command to print column and offset indexes. | ||
| */ | ||
| @Parameters(commandDescription = "Prints the column and offset indexes of a Parquet file") | ||
| public class ShowColumnIndexCommand extends BaseCommand { | ||
| public ShowColumnIndexCommand(Logger console) { | ||
| super(console); | ||
| } | ||
|
|
||
| @Parameter(description = "<parquet path>") | ||
| List<String> files; | ||
|
|
||
| @Parameter(names = { "-c", "--column" }, description = "Shows the column/offset indexes for the given column only") | ||
| List<String> ColumnPaths; | ||
|
|
||
| @Parameter(names = { "-b", | ||
| "--block" }, description = "Shows the column/offset indexes for the given block (row-group) only; " | ||
| + "blocks are referenced by their indexes from 0") | ||
| List<String> blockIndexes; | ||
|
|
||
| @Parameter(names = { "-i", "--column-index" }, description = "Shows the column indexes; " | ||
| + "active by default unless -o is used") | ||
| boolean showColumnIndex; | ||
|
|
||
| @Parameter(names = { "-o", "--offset-index" }, description = "Shows the offset indexes; " | ||
| + "active by default unless -i is used") | ||
| boolean showOffsetIndex; | ||
|
|
||
| @Override | ||
| public List<String> getExamples() { | ||
| return Lists.newArrayList( | ||
| "# Show only column indexes for column 'col' from a Parquet file", | ||
| "-c col -i sample.parquet"); | ||
| } | ||
|
|
||
| @Override | ||
| public int run() throws IOException { | ||
| Preconditions.checkArgument(files != null && files.size() >= 1, | ||
| "A Parquet file is required."); | ||
| Preconditions.checkArgument(files.size() == 1, | ||
| "Cannot process multiple Parquet files."); | ||
|
|
||
| InputFile in = HadoopInputFile.fromPath(new Path(files.get(0)), new Configuration()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should use the helper methods in BaseCommand. Those helpers turn arguments into paths the way users expect when interacting with a CLI utility. For example, "/tmp/file.parquet" is opened in the local FS, not the default FS. |
||
| if (!showColumnIndex && !showOffsetIndex) { | ||
| showColumnIndex = showOffsetIndex = true; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: it is clearer to use separate assignments, because this is one character away from assigning the result of a boolean test. |
||
| } | ||
|
|
||
| try (ParquetFileReader reader = ParquetFileReader.open(in)) { | ||
| boolean firstBlock = true; | ||
| for (Entry<Integer, BlockMetaData> entry : getBlocks(reader.getFooter())) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor: It is odd to me that |
||
| if (!firstBlock) { | ||
| console.info(""); | ||
| } | ||
| firstBlock = false; | ||
| console.info("row group {}:", entry.getKey()); | ||
| for (ColumnChunkMetaData column : getColumns(entry.getValue())) { | ||
| String path = column.getPath().toDotString(); | ||
| if (showColumnIndex) { | ||
| console.info("column index for column {}:", path); | ||
| ColumnIndex columnIndex = reader.readColumnIndex(column); | ||
| if (columnIndex == null) { | ||
| console.info("NONE"); | ||
| } else { | ||
| console.info(columnIndex.toString()); | ||
| } | ||
| } | ||
| if (showOffsetIndex) { | ||
| console.info("offset index for column {}:", path); | ||
| OffsetIndex offsetIndex = reader.readOffsetIndex(column); | ||
| if (offsetIndex == null) { | ||
| console.info("NONE"); | ||
| } else { | ||
| console.info(offsetIndex.toString()); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| return 0; | ||
| } | ||
|
|
||
| // Returns the index-block pairs based on the arguments of --block | ||
| private List<Entry<Integer, BlockMetaData>> getBlocks(ParquetMetadata meta) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor: It would be better to return a map instead. I think it's bad practice to use Entry outside of a map just because you need a Pair. |
||
| List<BlockMetaData> blocks = meta.getBlocks(); | ||
| List<Entry<Integer, BlockMetaData>> pairs = new ArrayList<>(); | ||
| if (blockIndexes == null || blockIndexes.isEmpty()) { | ||
| int index = 0; | ||
| for (BlockMetaData block : blocks) { | ||
| pairs.add(new AbstractMap.SimpleImmutableEntry<>(index++, block)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: Using the return value of a ++ expression makes the code harder to read. Statements that set variables should be independent. |
||
| } | ||
| } else { | ||
| for (String indexStr : blockIndexes) { | ||
| int index = Integer.parseInt(indexStr); | ||
| pairs.add(new AbstractMap.SimpleImmutableEntry<>(index, blocks.get(index))); | ||
| } | ||
| } | ||
| return pairs; | ||
| } | ||
|
|
||
| private List<ColumnChunkMetaData> getColumns(BlockMetaData block) { | ||
| List<ColumnChunkMetaData> columns = block.getColumns(); | ||
| if (ColumnPaths == null || ColumnPaths.isEmpty()) { | ||
| return columns; | ||
| } | ||
| Map<String, ColumnChunkMetaData> pathMap = new HashMap<>(); | ||
| for (ColumnChunkMetaData column : columns) { | ||
| pathMap.put(column.getPath().toDotString(), column); | ||
| } | ||
|
|
||
| List<ColumnChunkMetaData> filtered = new ArrayList<>(); | ||
| for (String path : ColumnPaths) { | ||
| ColumnChunkMetaData column = pathMap.get(path); | ||
| if (column != null) { | ||
| filtered.add(column); | ||
| } | ||
| } | ||
| return filtered; | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -72,7 +72,13 @@ public ColumnReadStoreImpl(PageReadStore pageReadStore, | |
|
|
||
| @Override | ||
| public ColumnReader getColumnReader(ColumnDescriptor path) { | ||
| return newMemColumnReader(path, pageReadStore.getPageReader(path)); | ||
| PrimitiveConverter converter = getPrimitiveConverter(path); | ||
| PageReader pageReader = pageReadStore.getPageReader(path); | ||
| if (pageReadStore.isInPageFilteringMode()) { | ||
| return new SynchronizingColumnReader(path, pageReader, converter, writerVersion, pageReadStore.getRowIndexes()); | ||
| } else { | ||
| return new ColumnReaderImpl(path, pageReader, converter, writerVersion); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why doesn't this use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| } | ||
| } | ||
|
|
||
| public ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
User-facing options should always use "row group" and never "block" because block is used in several different contexts and is confusing. Row group is always clear.