apache · zivanfi · Oct 18, 2018 · May 17, 2018 · May 22, 2018 · May 28, 2018
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
@@ -32,6 +32,7 @@
 import org.apache.parquet.cli.commands.ConvertCommand;
 import org.apache.parquet.cli.commands.ParquetMetadataCommand;
 import org.apache.parquet.cli.commands.SchemaCommand;
+import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
 import org.apache.parquet.cli.commands.ShowDictionaryCommand;
 import org.apache.parquet.cli.commands.ShowPagesCommand;
 import org.apache.parquet.cli.commands.ToAvroCommand;
@@ -87,6 +88,7 @@ public class Main extends Configured implements Tool {
     jc.addCommand("to-avro", new ToAvroCommand(console));
     jc.addCommand("cat", new CatCommand(console, 0));
     jc.addCommand("head", new CatCommand(console, 10));
+    jc.addCommand("column-index", new ShowColumnIndexCommand(console));
   }
 
   @Override

diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli.commands;
+
+import java.io.IOException;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.internal.column.columnindex.OffsetIndex;
+import org.apache.parquet.io.InputFile;
+import org.slf4j.Logger;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
+/**
+ * parquet-cli command to print column and offset indexes.
+ */
+@Parameters(commandDescription = "Prints the column and offset indexes of a Parquet file")
+public class ShowColumnIndexCommand extends BaseCommand {
+  public ShowColumnIndexCommand(Logger console) {
+    super(console);
+  }
+
+  @Parameter(description = "<parquet path>")
+  List<String> files;
+
+  @Parameter(names = { "-c", "--column" }, description = "Shows the column/offset indexes for the given column only")
+  List<String> ColumnPaths;
+
+  @Parameter(names = { "-b",
+      "--block" }, description = "Shows the column/offset indexes for the given block (row-group) only; "
+          + "blocks are referenced by their indexes from 0")
+  List<String> blockIndexes;
+
+  @Parameter(names = { "-i", "--column-index" }, description = "Shows the column indexes; "
+      + "active by default unless -o is used")
+  boolean showColumnIndex;
+
+  @Parameter(names = { "-o", "--offset-index" }, description = "Shows the offset indexes; "
+      + "active by default unless -i is used")
+  boolean showOffsetIndex;
+
+  @Override
+  public List<String> getExamples() {
+    return Lists.newArrayList(
+        "# Show only column indexes for column 'col' from a Parquet file",
+        "-c col -i sample.parquet");
+  }
+
+  @Override
+  public int run() throws IOException {
+    Preconditions.checkArgument(files != null && files.size() >= 1,
+        "A Parquet file is required.");
+    Preconditions.checkArgument(files.size() == 1,
+        "Cannot process multiple Parquet files.");
+
+    InputFile in = HadoopInputFile.fromPath(new Path(files.get(0)), new Configuration());
+    if (!showColumnIndex && !showOffsetIndex) {
+      showColumnIndex = showOffsetIndex = true;
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
+      boolean firstBlock = true;
+      for (Entry<Integer, BlockMetaData> entry : getBlocks(reader.getFooter())) {
+        if (!firstBlock) {
+          console.info("");
+        }
+        firstBlock = false;
+        console.info("row group {}:", entry.getKey());
+        for (ColumnChunkMetaData column : getColumns(entry.getValue())) {
+          String path = column.getPath().toDotString();
+          if (showColumnIndex) {
+            console.info("column index for column {}:", path);
+            ColumnIndex columnIndex = reader.readColumnIndex(column);
+            if (columnIndex == null) {
+              console.info("NONE");
+            } else {
+              console.info(columnIndex.toString());
+            }
+          }
+          if (showOffsetIndex) {
+            console.info("offset index for column {}:", path);
+            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+            if (offsetIndex == null) {
+              console.info("NONE");
+            } else {
+              console.info(offsetIndex.toString());
+            }
+          }
+        }
+      }
+    }
+    return 0;
+  }
+
+  // Returns the index-block pairs based on the arguments of --block
+  private List<Entry<Integer, BlockMetaData>> getBlocks(ParquetMetadata meta) {
+    List<BlockMetaData> blocks = meta.getBlocks();
+    List<Entry<Integer, BlockMetaData>> pairs = new ArrayList<>();
+    if (blockIndexes == null || blockIndexes.isEmpty()) {
+      int index = 0;
+      for (BlockMetaData block : blocks) {
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index++, block));
+      }
+    } else {
+      for (String indexStr : blockIndexes) {
+        int index = Integer.parseInt(indexStr);
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index, blocks.get(index)));
+      }
+    }
+    return pairs;
+  }
+
+  private List<ColumnChunkMetaData> getColumns(BlockMetaData block) {
+    List<ColumnChunkMetaData> columns = block.getColumns();
+    if (ColumnPaths == null || ColumnPaths.isEmpty()) {
+      return columns;
+    }
+    Map<String, ColumnChunkMetaData> pathMap = new HashMap<>();
+    for (ColumnChunkMetaData column : columns) {
+      pathMap.put(column.getPath().toDotString(), column);
+    }
+
+    List<ColumnChunkMetaData> filtered = new ArrayList<>();
+    for (String path : ColumnPaths) {
+      ColumnChunkMetaData column = pathMap.get(path);
+      if (column != null) {
+        filtered.add(column);
+      }
+    }
+    return filtered;
+  }
+
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ColumnReader.java b/parquet-column/src/main/java/org/apache/parquet/column/ColumnReader.java
@@ -41,7 +41,10 @@ public interface ColumnReader {
 
   /**
    * @return the totalCount of values to be consumed
+   * @deprecated will be removed in 2.0.0; Total values might not be able to be counted before reading the values (e.g.
+   *             in case of column index based filtering)
    */
+  @Deprecated
   long getTotalValueCount();
 
   /**

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java
@@ -47,6 +47,7 @@ public class ParquetProperties {
   public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true;
   public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100;
   public static final int DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK = 10000;
+  public static final int DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH = 64;
 
   public static final ValuesWriterFactory DEFAULT_VALUES_WRITER_FACTORY = new DefaultValuesWriterFactory();
 
@@ -83,10 +84,11 @@ public static WriterVersion fromString(String name) {
   private final boolean estimateNextSizeCheck;
   private final ByteBufferAllocator allocator;
   private final ValuesWriterFactory valuesWriterFactory;
+  private final int columnIndexTruncateLength;
 
   private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPageSize, boolean enableDict, int minRowCountForPageSizeCheck,
                             int maxRowCountForPageSizeCheck, boolean estimateNextSizeCheck, ByteBufferAllocator allocator,
-                            ValuesWriterFactory writerFactory) {
+                            ValuesWriterFactory writerFactory, int columnIndexMinMaxTruncateLength) {
     this.pageSizeThreshold = pageSize;
     this.initialSlabSize = CapacityByteArrayOutputStream
       .initialSlabSizeHeuristic(MIN_SLAB_SIZE, pageSizeThreshold, 10);
@@ -99,6 +101,7 @@ private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPag
     this.allocator = allocator;
 
     this.valuesWriterFactory = writerFactory;
+    this.columnIndexTruncateLength = columnIndexMinMaxTruncateLength;
   }
 
   public ValuesWriter newRepetitionLevelWriter(ColumnDescriptor path) {
@@ -163,7 +166,7 @@ public ColumnWriteStore newColumnWriteStore(MessageType schema,
                                               PageWriteStore pageStore) {
     switch (writerVersion) {
     case PARQUET_1_0:
-      return new ColumnWriteStoreV1(pageStore, this);
+      return new ColumnWriteStoreV1(schema, pageStore, this);
     case PARQUET_2_0:
       return new ColumnWriteStoreV2(schema, pageStore, this);
     default:
@@ -183,6 +186,10 @@ public ValuesWriterFactory getValuesWriterFactory() {
     return valuesWriterFactory;
   }
 
+  public int getColumnIndexTruncateLength() {
+    return columnIndexTruncateLength;
+  }
+
   public boolean estimateNextSizeCheck() {
     return estimateNextSizeCheck;
   }
@@ -205,6 +212,7 @@ public static class Builder {
     private boolean estimateNextSizeCheck = DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK;
     private ByteBufferAllocator allocator = new HeapByteBufferAllocator();
     private ValuesWriterFactory valuesWriterFactory = DEFAULT_VALUES_WRITER_FACTORY;
+    private int columnIndexTruncateLength = DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
 
     private Builder() {
     }
@@ -299,11 +307,17 @@ public Builder withValuesWriterFactory(ValuesWriterFactory factory) {
       return this;
     }
 
+    public Builder withColumnIndexTruncateLength(int length) {
+      Preconditions.checkArgument(length > 0, "Invalid column index min/max truncate length (negative) : %s", length);
+      this.columnIndexTruncateLength = length;
+      return this;
+    }
+
     public ParquetProperties build() {
       ParquetProperties properties =
         new ParquetProperties(writerVersion, pageSize, dictPageSize,
           enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck,
-          estimateNextSizeCheck, allocator, valuesWriterFactory);
+          estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength);
       // we pass a constructed but uninitialized factory to ParquetProperties above as currently
       // creation of ValuesWriters is invoked from within ParquetProperties. In the future
       // we'd like to decouple that and won't need to pass an object to properties and then pass the

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReadStoreImpl.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReadStoreImpl.java
@@ -72,7 +72,13 @@ public ColumnReadStoreImpl(PageReadStore pageReadStore,
 
   @Override
   public ColumnReader getColumnReader(ColumnDescriptor path) {
-    return newMemColumnReader(path, pageReadStore.getPageReader(path));
+    PrimitiveConverter converter = getPrimitiveConverter(path);
+    PageReader pageReader = pageReadStore.getPageReader(path);
+    if (pageReadStore.isInPageFilteringMode()) {
+      return new SynchronizingColumnReader(path, pageReader, converter, writerVersion, pageReadStore.getRowIndexes());
+    } else {
+      return new ColumnReaderImpl(path, pageReader, converter, writerVersion);
+    }
   }
 
   public ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) {