diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java index 990193c731..fa69ce7a40 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -32,6 +32,7 @@ import org.apache.parquet.cli.commands.ConvertCommand; import org.apache.parquet.cli.commands.ParquetMetadataCommand; import org.apache.parquet.cli.commands.SchemaCommand; +import org.apache.parquet.cli.commands.ShowColumnIndexCommand; import org.apache.parquet.cli.commands.ShowDictionaryCommand; import org.apache.parquet.cli.commands.ShowPagesCommand; import org.apache.parquet.cli.commands.ToAvroCommand; @@ -87,6 +88,7 @@ public class Main extends Configured implements Tool { jc.addCommand("to-avro", new ToAvroCommand(console)); jc.addCommand("cat", new CatCommand(console, 0)); jc.addCommand("head", new CatCommand(console, 10)); + jc.addCommand("column-index", new ShowColumnIndexCommand(console)); } @Override diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java new file mode 100644 index 0000000000..0407a8d0cb --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli.commands; + +import java.io.IOException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.io.InputFile; +import org.slf4j.Logger; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; + +/** + * parquet-cli command to print column and offset indexes. 
+ */
+@Parameters(commandDescription = "Prints the column and offset indexes of a Parquet file")
+public class ShowColumnIndexCommand extends BaseCommand {
+  public ShowColumnIndexCommand(Logger console) {
+    super(console);
+  }
+
+  // Positional arguments; run() requires exactly one entry.
+  @Parameter(description = "<parquet path>")
+  List<String> files;
+
+  // Column paths in dot notation to restrict the output to; null or empty selects every column.
+  @Parameter(names = { "-c", "--column" }, description = "Shows the column/offset indexes for the given column only")
+  List<String> columnPaths;
+
+  // Zero-based row-group indexes to restrict the output to; null or empty selects every row group.
+  @Parameter(names = { "-b",
+      "--block" }, description = "Shows the column/offset indexes for the given block (row-group) only; "
+          + "blocks are referenced by their indexes from 0")
+  List<String> blockIndexes;
+
+  @Parameter(names = { "-i", "--column-index" }, description = "Shows the column indexes; "
+      + "active by default unless -o is used")
+  boolean showColumnIndex;
+
+  @Parameter(names = { "-o", "--offset-index" }, description = "Shows the offset indexes; "
+      + "active by default unless -i is used")
+  boolean showOffsetIndex;
+
+  @Override
+  public List<String> getExamples() {
+    return Lists.newArrayList(
+        "# Show only column indexes for column 'col' from a Parquet file",
+        "-c col -i sample.parquet");
+  }
+
+  /**
+   * Logs the column and/or offset index of every selected column chunk in every
+   * selected row group of the given file. "NONE" is logged for an index that the
+   * reader returns as null.
+   *
+   * @return 0 once the file has been processed
+   * @throws IOException propagated from opening or reading the file
+   * @throws IllegalArgumentException if not exactly one file was given or a
+   *           --block value is not a valid row-group index
+   */
+  @Override
+  public int run() throws IOException {
+    Preconditions.checkArgument(files != null && files.size() >= 1,
+        "A Parquet file is required.");
+    Preconditions.checkArgument(files.size() == 1,
+        "Cannot process multiple Parquet files.");
+
+    InputFile in = HadoopInputFile.fromPath(new Path(files.get(0)), new Configuration());
+    // Neither -i nor -o given: show both kinds of index.
+    if (!showColumnIndex && !showOffsetIndex) {
+      showColumnIndex = showOffsetIndex = true;
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
+      boolean firstBlock = true;
+      for (Entry<Integer, BlockMetaData> entry : getBlocks(reader.getFooter())) {
+        // Separate consecutive row groups with an empty line.
+        if (!firstBlock) {
+          console.info("");
+        }
+        firstBlock = false;
+        console.info("row group {}:", entry.getKey());
+        for (ColumnChunkMetaData column : getColumns(entry.getValue())) {
+          String path = column.getPath().toDotString();
+          if (showColumnIndex) {
+            console.info("column index for column {}:", path);
+            ColumnIndex columnIndex = reader.readColumnIndex(column);
+            if (columnIndex == null) {
+              console.info("NONE");
+            } else {
+              console.info(columnIndex.toString());
+            }
+          }
+          if (showOffsetIndex) {
+            console.info("offset index for column {}:", path);
+            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+            if (offsetIndex == null) {
+              console.info("NONE");
+            } else {
+              console.info(offsetIndex.toString());
+            }
+          }
+        }
+      }
+    }
+    return 0;
+  }
+
+  /**
+   * Returns the index-block pairs based on the arguments of --block. Without any
+   * --block argument every row group of the footer is returned in file order;
+   * otherwise the requested row groups are returned in the order they were given.
+   *
+   * @throws IllegalArgumentException if a --block value is not an integer or does
+   *           not reference an existing row group
+   */
+  private List<Entry<Integer, BlockMetaData>> getBlocks(ParquetMetadata meta) {
+    List<BlockMetaData> blocks = meta.getBlocks();
+    List<Entry<Integer, BlockMetaData>> pairs = new ArrayList<>();
+    if (blockIndexes == null || blockIndexes.isEmpty()) {
+      int index = 0;
+      for (BlockMetaData block : blocks) {
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index++, block));
+      }
+    } else {
+      for (String indexStr : blockIndexes) {
+        int index;
+        try {
+          index = Integer.parseInt(indexStr.trim());
+        } catch (NumberFormatException e) {
+          throw new IllegalArgumentException("Invalid block index: '" + indexStr + "'", e);
+        }
+        // Fail with a readable message instead of a bare IndexOutOfBoundsException.
+        Preconditions.checkArgument(index >= 0 && index < blocks.size(),
+            "Block index %s is out of range; the file has %s row group(s)", index, blocks.size());
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index, blocks.get(index)));
+      }
+    }
+    return pairs;
+  }
+
+  /**
+   * Returns the column chunks of the block selected by --column, in the order the
+   * paths were given. Without any --column argument all column chunks of the block
+   * are returned. Paths that match no column of the block are skipped silently.
+   */
+  private List<ColumnChunkMetaData> getColumns(BlockMetaData block) {
+    List<ColumnChunkMetaData> columns = block.getColumns();
+    if (columnPaths == null || columnPaths.isEmpty()) {
+      return columns;
+    }
+    // Index the chunks by their dot-notation path for lookup by the requested names.
+    Map<String, ColumnChunkMetaData> pathMap = new HashMap<>();
+    for (ColumnChunkMetaData column : columns) {
+      pathMap.put(column.getPath().toDotString(), column);
+    }
+
+    List<ColumnChunkMetaData> filtered = new ArrayList<>();
+    for (String path : columnPaths) {
+      ColumnChunkMetaData column = pathMap.get(path);
+      if (column != null) {
+        filtered.add(column);
+      }
+    }
+    return filtered;
+  }
+
+}
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java
new file mode 100644
index 0000000000..f31599a4fd
--- /dev/null
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.tools.command; + +import java.io.PrintWriter; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.tools.Main; + +/** + * parquet-tools command to print column and offset indexes. 
+ */
+public class ColumnIndexCommand extends ArgsOnlyCommand {
+  public static final String[] USAGE = new String[] {
+    "<input>",
+    "where <input> is the parquet file to print the column and offset indexes for"
+  };
+
+  /** Command-line options of this command: -c/--column, -b/--block, -i/--column-index, -o/--offset-index. */
+  public static final Options OPTIONS;
+  static {
+    OPTIONS = new Options();
+    OPTIONS.addOption(Option.builder("c")
+        .longOpt("column")
+        .desc("Shows the column/offset indexes for the given column only; "
+            + "multiple columns shall be separated by commas")
+        .hasArg()
+        .build());
+    OPTIONS.addOption(Option.builder("b")
+        .longOpt("block")
+        .desc("Shows the column/offset indexes for the given block (row-group) only; "
+            + "multiple blocks shall be separated by commas; "
+            + "blocks are referenced by their indexes from 0")
+        .hasArg()
+        .build());
+    OPTIONS.addOption(Option.builder("i")
+        .longOpt("column-index")
+        .desc("Shows the column indexes; "
+            + "active by default unless -o is used")
+        .hasArg(false)
+        .build());
+    OPTIONS.addOption(Option.builder("o")
+        .longOpt("offset-index")
+        .desc("Shows the offset indexes; "
+            + "active by default unless -i is used")
+        .hasArg(false)
+        .build());
+  }
+
+  public ColumnIndexCommand() {
+    // NOTE(review): presumably the minimum and maximum number of positional arguments
+    // (exactly one input file) -- confirm against ArgsOnlyCommand.
+    super(1, 1);
+  }
+
+  @Override
+  public String[] getUsageDescription() {
+    return USAGE;
+  }
+
+  @Override
+  public String getCommandDescription() {
+    return "Prints the column and offset indexes of a Parquet file.";
+  }
+
+  @Override
+  public Options getOptions() {
+    return OPTIONS;
+  }
+
+  /**
+   * Prints the column and/or offset index of every selected column chunk in every
+   * selected row group of the file named by the first positional argument.
+   * "NONE" is printed for an index that the reader returns as null.
+   *
+   * @param options the parsed command line
+   * @throws IllegalArgumentException if a -b value is not a valid row-group index
+   * @throws Exception propagated from the superclass or from reading the file
+   */
+  @Override
+  public void execute(CommandLine options) throws Exception {
+    super.execute(options);
+
+    String[] args = options.getArgs();
+    InputFile in = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
+    // Auto-flushing writer over Main.out; intentionally not closed here so the
+    // underlying stream stays open (presumably shared by the tool -- confirm).
+    PrintWriter out = new PrintWriter(Main.out, true);
+    String blockValue = options.getOptionValue("b");
+    // Trim first so that blanks around the whole list do not end up in the first/last element.
+    String[] indexes = blockValue == null ? null : blockValue.trim().split("\\s*,\\s*");
+    boolean showColumnIndex = options.hasOption("i");
+    boolean showOffsetIndex = options.hasOption("o");
+    // Neither -i nor -o given: show both kinds of index.
+    if (!showColumnIndex && !showOffsetIndex) {
+      showColumnIndex = showOffsetIndex = true;
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
+      boolean firstBlock = true;
+      for (Entry<Integer, BlockMetaData> entry : getBlocks(reader.getFooter(), indexes)) {
+        // Separate consecutive row groups with an empty line.
+        if (!firstBlock) {
+          out.println();
+        }
+        firstBlock = false;
+        out.format("row group %d:%n", entry.getKey());
+        for (ColumnChunkMetaData column : getColumns(entry.getValue(), options)) {
+          String path = column.getPath().toDotString();
+          if (showColumnIndex) {
+            out.format("column index for column %s:%n", path);
+            ColumnIndex columnIndex = reader.readColumnIndex(column);
+            if (columnIndex == null) {
+              out.println("NONE");
+            } else {
+              out.println(columnIndex);
+            }
+          }
+          if (showOffsetIndex) {
+            out.format("offset index for column %s:%n", path);
+            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+            if (offsetIndex == null) {
+              out.println("NONE");
+            } else {
+              out.println(offsetIndex);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns the index-block pairs based on the arguments of --block. With
+   * {@code indexes == null} every row group of the footer is returned in file
+   * order; otherwise the requested row groups are returned in the given order.
+   *
+   * @throws IllegalArgumentException if an index is not an integer or does not
+   *           reference an existing row group
+   */
+  private static List<Entry<Integer, BlockMetaData>> getBlocks(ParquetMetadata meta, String[] indexes) {
+    List<BlockMetaData> blocks = meta.getBlocks();
+    List<Entry<Integer, BlockMetaData>> pairs = new ArrayList<>();
+    if (indexes == null) {
+      int index = 0;
+      for (BlockMetaData block : blocks) {
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index++, block));
+      }
+    } else {
+      for (String indexStr : indexes) {
+        int index;
+        try {
+          index = Integer.parseInt(indexStr);
+        } catch (NumberFormatException e) {
+          throw new IllegalArgumentException("Invalid block index: '" + indexStr + "'", e);
+        }
+        // Fail with a readable message instead of a bare IndexOutOfBoundsException.
+        if (index < 0 || index >= blocks.size()) {
+          throw new IllegalArgumentException("Block index " + index
+              + " is out of range; the file has " + blocks.size() + " row group(s)");
+        }
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index, blocks.get(index)));
+      }
+    }
+    return pairs;
+  }
+
+  /**
+   * Returns the column chunks of the block selected by the comma-separated value
+   * of -c, in the order the paths were given. Without -c all column chunks of the
+   * block are returned. Paths that match no column of the block are skipped silently.
+   */
+  private static List<ColumnChunkMetaData> getColumns(BlockMetaData block, CommandLine options) {
+    List<ColumnChunkMetaData> columns = block.getColumns();
+    String pathValue = options.getOptionValue("c");
+    if (pathValue == null) {
+      return columns;
+    }
+    // Trim first so that blanks around the whole list do not prevent a match.
+    String[] paths = pathValue.trim().split("\\s*,\\s*");
+    // Index the chunks by their dot-notation path for lookup by the requested names.
+    Map<String, ColumnChunkMetaData> pathMap = new HashMap<>();
+    for (ColumnChunkMetaData column : columns) {
+      pathMap.put(column.getPath().toDotString(), column);
+    }
+
+    List<ColumnChunkMetaData> filtered = new ArrayList<>();
+    for (String path : paths) {
+      ColumnChunkMetaData column = pathMap.get(path);
+      if (column != null) {
+        filtered.add(column);
+      }
+    }
+    return filtered;
+  }
+
+}
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
index 6df84be37a..399efb7316 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
@@ -34,6 +34,7 @@ public final class Registry {
     registry.put("merge", MergeCommand.class);
     registry.put("rowcount", RowCountCommand.class);
     registry.put("size", SizeCommand.class);
+    registry.put("column-index", ColumnIndexCommand.class);
   }
 
   public static Map allCommands() {