Skip to content

Commit

Permalink
GH-3125: Add CLI for SizeStatistics
Browse files Browse the repository at this point in the history
  • Loading branch information
wgtmac committed Jan 21, 2025
1 parent d5f86d7 commit c3213a0
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 6 deletions.
2 changes: 2 additions & 0 deletions parquet-cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ Usage: parquet [options] [command] [command options]
Scan all records from a file
rewrite
Rewrite one or more Parquet files to a new Parquet file
size-stats
Print size statistics for a Parquet file
Examples:
Expand Down
2 changes: 2 additions & 0 deletions parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import org.apache.parquet.cli.commands.ShowDictionaryCommand;
import org.apache.parquet.cli.commands.ShowFooterCommand;
import org.apache.parquet.cli.commands.ShowPagesCommand;
import org.apache.parquet.cli.commands.ShowSizeStatisticsCommand;
import org.apache.parquet.cli.commands.ToAvroCommand;
import org.apache.parquet.cli.commands.TransCompressionCommand;
import org.slf4j.Logger;
Expand Down Expand Up @@ -105,6 +106,7 @@ public class Main extends Configured implements Tool {
jc.addCommand("bloom-filter", new ShowBloomFilterCommand(console));
jc.addCommand("scan", new ScanCommand(console));
jc.addCommand("rewrite", new RewriteCommand(console));
jc.addCommand("size-stats", new ShowSizeStatisticsCommand(console));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.parquet.cli.commands;

import static org.apache.parquet.cli.Util.humanReadable;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.List;
import org.apache.commons.text.TextStringBuilder;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.column.statistics.SizeStatistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.slf4j.Logger;

/**
 * CLI command that prints the size statistics stored in the footer of a Parquet file.
 *
 * <p>For each row group one table is logged, with one row per column chunk showing the
 * unencoded byte-array data size and the repetition/definition level histograms. A
 * {@code -} is printed for any value that is missing, and for every value of a column
 * whose size statistics are absent or invalid.
 */
@Parameters(commandDescription = "Print size statistics for a Parquet file")
public class ShowSizeStatisticsCommand extends BaseCommand {

  /** Placeholder printed when a statistic is not available. */
  private static final String MISSING_VALUE = "-";

  /** Length of the horizontal rule printed under each row group heading. */
  private static final int RULE_WIDTH = 80;

  /**
   * Template that yields the per-row format once the width of the first column is filled
   * in. The doubled {@code %%} survive the first {@code String.format} pass as single
   * {@code %} characters.
   */
  private static final String ROW_FORMAT_TEMPLATE = "%%-%ds %%-15s %%-40s %%-40s";

  public ShowSizeStatisticsCommand(Logger console) {
    super(console);
  }

  @Parameter(description = "<parquet path>")
  List<String> targets;

  /**
   * Reads the footer of the single target file and logs one statistics table per row group.
   *
   * @return {@code 0} when the file was processed
   * @throws IOException if the file cannot be opened or read
   * @throws IllegalArgumentException if no target or more than one target was given
   */
  @Override
  public int run() throws IOException {
    Preconditions.checkArgument(targets != null && !targets.isEmpty(), "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");

    String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
      ParquetMetadata footer = reader.getFooter();
      MessageType schema = footer.getFileMetaData().getSchema();

      console.info("\nFile path: {}", source);

      List<BlockMetaData> rowGroups = footer.getBlocks();
      for (int index = 0, n = rowGroups.size(); index < n; index++) {
        printRowGroupSizeStats(console, index, rowGroups.get(index), schema);
        console.info("");
      }
    }

    return 0;
  }

  /**
   * Logs the heading, the table header and one row per column chunk of a row group.
   *
   * @param console logger that receives the output
   * @param index ordinal of the row group within the file
   * @param rowGroup metadata of the row group to print
   * @param schema file schema; not consulted by the current implementation
   */
  private void printRowGroupSizeStats(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
    List<ColumnChunkMetaData> columns = rowGroup.getColumns();

    // The first column must fit both its header and the longest column path.
    int maxColumnWidth = Math.max(
        "column".length(),
        columns.stream()
            .mapToInt(col -> col.getPath().toString().length())
            .max()
            .orElse(0));

    console.info(String.format(
        "\nRow group %d\n%s", index, new TextStringBuilder(RULE_WIDTH).appendPadding(RULE_WIDTH, '-')));

    // Built once per row group: the format only depends on the widest column path.
    String rowFormat = String.format(ROW_FORMAT_TEMPLATE, maxColumnWidth);
    console.info(
        String.format(rowFormat, "column", "unencoded bytes", "rep level histogram", "def level histogram"));

    for (ColumnChunkMetaData column : columns) {
      printColumnSizeStats(console, column, rowFormat);
    }
  }

  /**
   * Logs the table row of a single column chunk.
   *
   * @param console logger that receives the output
   * @param column column chunk whose size statistics are printed
   * @param rowFormat format string shared by all rows of the enclosing row group
   */
  private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, String rowFormat) {
    String unencodedBytes = MISSING_VALUE;
    String repLevels = MISSING_VALUE;
    String defLevels = MISSING_VALUE;

    SizeStatistics stats = column.getSizeStatistics();
    if (stats != null && stats.isValid()) {
      unencodedBytes = stats.getUnencodedByteArrayDataBytes()
          .map(bytes -> humanReadable(bytes))
          .orElse(MISSING_VALUE);
      repLevels = formatHistogram(stats.getRepetitionLevelHistogram());
      defLevels = formatHistogram(stats.getDefinitionLevelHistogram());
    }

    console.info(String.format(rowFormat, column.getPath(), unencodedBytes, repLevels, defLevels));
  }

  /** Returns the list rendering of a histogram, or the placeholder when it is null or empty. */
  private static String formatHistogram(List<Long> histogram) {
    return (histogram != null && !histogram.isEmpty()) ? histogram.toString() : MISSING_VALUE;
  }

  @Override
  public List<String> getExamples() {
    return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet");
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.cli.commands;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.junit.Assert;
import org.junit.Test;

/** Smoke test for {@link ShowSizeStatisticsCommand}. */
public class ShowSizeStatisticsCommandTest extends ParquetFileTest {

  /**
   * Runs the command on the file returned by {@code parquetFile()} and expects it to
   * report a zero exit code.
   */
  @Test
  public void testShowSizeStatisticsCommand() throws IOException {
    File input = parquetFile();

    ShowSizeStatisticsCommand sizeStats = new ShowSizeStatisticsCommand(createLogger());
    sizeStats.setConf(new Configuration());
    sizeStats.targets = Arrays.asList(input.getAbsolutePath());

    int exitCode = sizeStats.run();
    Assert.assertEquals(0, exitCode);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -182,14 +182,39 @@ public List<Long> getDefinitionLevelHistogram() {
return LongLists.unmodifiable(LongArrayList.wrap(defLevelHistogram));
}

/**
 * Renders the part of a flattened level histogram that belongs to one page.
 *
 * <p>The array is treated as {@code nullPages.length} equally sized consecutive slices;
 * the slice at {@code pageIndex} is printed as {@code [v0,v1,...]}.
 *
 * @param histogram flattened histogram values; may be {@code null} or empty
 * @param pageIndex index of the page whose slice is rendered
 * @return the bracketed, comma-separated slice, or the missing-value marker when the
 *     histogram is {@code null} or empty
 */
private String formatHistogram(long[] histogram, int pageIndex) {
  if (histogram == null || histogram.length == 0) {
    return TOSTRING_MISSING_VALUE_MARKER;
  }

  int levelsPerPage = histogram.length / nullPages.length;
  int start = pageIndex * levelsPerPage;
  int end = start + levelsPerPage;

  StringBuilder out = new StringBuilder();
  out.append('[');
  for (int pos = start; pos < end; pos++) {
    if (pos != start) {
      out.append(',');
    }
    out.append(histogram[pos]);
  }
  return out.append(']').toString();
}

@Override
public String toString() {
try (Formatter formatter = new Formatter()) {
formatter.format("Boundary order: %s\n", boundaryOrder);
String minMaxPart =
" %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s\n";
formatter.format("%-10s %20s" + minMaxPart, "", "null count", "min", "max");
String format = "page-%-5d %20s" + minMaxPart;
" %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s";
formatter.format(
"%-10s %20s" + minMaxPart + " %20s %20s\n",
"",
"null count",
"min",
"max",
"rep level histogram",
"def level histogram");
String format = "page-%-5d %20s" + minMaxPart + " %20s %20s\n";
int arrayIndex = 0;
for (int i = 0, n = nullPages.length; i < n; ++i) {
String nullCount =
Expand All @@ -201,7 +226,9 @@ public String toString() {
min = truncate(getMinValueAsString(arrayIndex));
max = truncate(getMaxValueAsString(arrayIndex++));
}
formatter.format(format, i, nullCount, min, max);
String repLevelHist = formatHistogram(repLevelHistogram, i);
String defLevelHist = formatHistogram(defLevelHistogram, i);
formatter.format(format, i, nullCount, min, max, repLevelHist, defLevelHist);
}
return formatter.toString();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,17 @@ private static class OffsetIndexImpl implements OffsetIndex {
@Override
public String toString() {
try (Formatter formatter = new Formatter()) {
formatter.format("%-10s %20s %16s %20s\n", "", "offset", "compressed size", "first row index");
formatter.format(
"%-10s %20s %20s %20s %20s\n",
"", "offset", "compressed size", "first row index", "unencoded bytes");
for (int i = 0, n = offsets.length; i < n; ++i) {
String unencodedBytes =
(unencodedByteArrayDataBytes != null && unencodedByteArrayDataBytes.length > 0)
? String.valueOf(unencodedByteArrayDataBytes[i])
: "-";
formatter.format(
"page-%-5d %20d %16d %20d\n", i, offsets[i], compressedPageSizes[i], firstRowIndexes[i]);
"page-%-5d %20d %20d %20d %20s\n",
i, offsets[i], compressedPageSizes[i], firstRowIndexes[i], unencodedBytes);
}
return formatter.toString();
}
Expand Down

0 comments on commit c3213a0

Please sign in to comment.