fix parquet-hive
julienledem committed Sep 5, 2014
1 parent 24a2050 commit ccdd08c
Showing 3 changed files with 77 additions and 38 deletions.
50 changes: 48 additions & 2 deletions parquet-hadoop/src/main/java/parquet/hadoop/ParquetInputSplit.java
@@ -24,6 +24,7 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;
@@ -35,6 +36,7 @@
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import parquet.bytes.BytesUtils;
import parquet.hadoop.metadata.BlockMetaData;

/**
* An input split for the Parquet format
@@ -61,17 +63,61 @@ public ParquetInputSplit() {
super(null, 0, 0, new String[0]);
}

/**
* For compatibility only:
* use {@link ParquetInputSplit#ParquetInputSplit(Path, long, long, long, String[], long[], String, Map)} instead.
* @param path the path of the file for that split
* @param start the start offset in the file
* @param length the length of the split in bytes
* @param hosts the hosts with the replicas of this data
* @param blocks the metadata of the row groups selected for this split
* @param requestedSchema the user requested schema
* @param fileSchema the schema of the file
* @param extraMetadata key/value metadata from the file footer
* @param readSupportMetadata metadata from the read support
*/
@Deprecated
public ParquetInputSplit(
Path path,
long start,
long length,
String[] hosts,
List<BlockMetaData> blocks,
String requestedSchema,
String fileSchema,
Map<String, String> extraMetadata,
Map<String, String> readSupportMetadata) {
this(
path, start, length, end(blocks), hosts,
offsets(blocks),
requestedSchema, readSupportMetadata
);
}

private static long end(List<BlockMetaData> blocks) {
BlockMetaData last = blocks.get(blocks.size() - 1);
return last.getStartingPos() + last.getCompressedSize();
}

private static long[] offsets(List<BlockMetaData> blocks) {
long[] offsets = new long[blocks.size()];
for (int i = 0; i < offsets.length; i++) {
offsets[i] = blocks.get(i).getStartingPos();
}
return offsets;
}

/**
* @param file the path of the file for that split
* @param start the strat offset in the file
* @param start the start offset in the file
* @param end the end offset in the file
* @param length the actual size in bytes that we expect to read
* @param hosts the hosts with the replicas of this data
* @param rowGroupOffsets the offsets of the rowgroups selected if loaded on the client
* @param requestedSchema the user requested schema
* @param readSupportMetadata metadata from the read support
*/
ParquetInputSplit(
public ParquetInputSplit(
Path file, long start, long end, long length, String[] hosts,
long[] rowGroupOffsets,
String requestedSchema,
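For reference, a minimal sketch of what the end() and offsets() helpers above compute, using a local Block stand-in in place of parquet's BlockMetaData; the stand-in type and the sample offsets are assumptions, not code from this repository:

import java.util.Arrays;
import java.util.List;

// Stand-in for parquet.hadoop.metadata.BlockMetaData, only for this sketch.
record Block(long startingPos, long compressedSize) {}

public class SplitBoundsSketch {

  // Mirrors end(): the end offset of the last row group in the split.
  static long end(List<Block> blocks) {
    Block last = blocks.get(blocks.size() - 1);
    return last.startingPos() + last.compressedSize();
  }

  // Mirrors offsets(): one starting position per selected row group.
  static long[] offsets(List<Block> blocks) {
    long[] offsets = new long[blocks.size()];
    for (int i = 0; i < offsets.length; i++) {
      offsets[i] = blocks.get(i).startingPos();
    }
    return offsets;
  }

  public static void main(String[] args) {
    // Two hypothetical row groups: one at offset 4 (100 bytes compressed), one at 104 (80 bytes).
    List<Block> blocks = List.of(new Block(4, 100), new Block(104, 80));
    System.out.println(end(blocks));                      // 184
    System.out.println(Arrays.toString(offsets(blocks))); // [4, 104]
  }
}

In the deprecated constructor these two values replace the List<BlockMetaData> argument when delegating to the new offset-based constructor.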
@@ -22,6 +22,7 @@
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
@@ -42,6 +43,7 @@
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.hadoop.util.ContextUtil;
import parquet.hadoop.util.counters.BenchmarkCounter;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

/**
@@ -145,10 +147,11 @@ private void initializeInternalReader(ParquetInputSplit split, Configuration con
long[] rowGroupOffsets = split.getRowGroupOffsets();
List<BlockMetaData> filteredBlocks;
// if task.side.metadata is set, rowGroupOffsets is null
MessageType fileSchema = footer.getFileMetaData().getSchema();
if (rowGroupOffsets == null) {
// then we need to apply the predicate push down filter
Filter filter = ParquetInputFormat.getFilter(configuration);
filteredBlocks = RowGroupFilter.filterRowGroups(filter, footer.getBlocks(), footer.getFileMetaData().getSchema());
filteredBlocks = RowGroupFilter.filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
// otherwise we find the row groups that were selected on the client
Set<Long> offsets = new HashSet<Long>();
@@ -177,10 +180,13 @@ private void initializeInternalReader(ParquetInputSplit split, Configuration con
+ " in range " + split.getStart() + ", " + split.getEnd());
}
}
MessageType requestedSchema = MessageTypeParser.parseMessageType(split.getRequestedSchema());
Map<String, String> fileMetaData = footer.getFileMetaData().getKeyValueMetaData();
Map<String, String> readSupportMetadata = split.getReadSupportMetadata();
internalReader.initialize(
MessageTypeParser.parseMessageType(split.getRequestedSchema()),
footer.getFileMetaData().getSchema(),
footer.getFileMetaData().getKeyValueMetaData(), split.getReadSupportMetadata(), path,
requestedSchema, fileSchema,
fileMetaData, readSupportMetadata,
path,
filteredBlocks, configuration);
}

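To make the two branches above easier to follow: when the split carries no row-group offsets (task-side metadata), the reader filters the footer's row groups with the predicate push-down filter; when offsets are present, it keeps exactly the row groups whose starting positions were selected on the client. A minimal sketch of that selection logic, with a local Block stand-in and sample offsets that are assumptions rather than code from this repository:

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class RowGroupSelectionSketch {

  // Stand-in for BlockMetaData; only the starting position matters here.
  record Block(long startingPos) {}

  static List<Block> select(List<Block> footerBlocks, long[] rowGroupOffsets) {
    if (rowGroupOffsets == null) {
      // Task-side metadata: the real reader applies RowGroupFilter.filterRowGroups here;
      // this sketch just keeps every row group.
      return footerBlocks;
    }
    // Client-side metadata: keep the row groups whose starting offset was chosen on the client.
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    return footerBlocks.stream()
        .filter(block -> offsets.contains(block.startingPos()))
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<Block> blocks = List.of(new Block(4), new Block(104), new Block(204));
    System.out.println(select(blocks, new long[] {104, 204})); // keeps the row groups at 104 and 204
    System.out.println(select(blocks, null));                  // all three; filtering happens elsewhere
  }
}

Here rowGroupOffsets corresponds to split.getRowGroupOffsets(), which the diff documents as null when task.side.metadata is set.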
@@ -13,9 +13,9 @@
*/
package org.apache.hadoop.hive.ql.io.parquet.read;

import static parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -35,7 +35,6 @@
import parquet.hadoop.ParquetInputFormat;
import parquet.hadoop.ParquetInputSplit;
import parquet.hadoop.api.ReadSupport.ReadContext;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.FileMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.hadoop.util.ContextUtil;
@@ -194,45 +193,33 @@ protected ParquetInputSplit getSplit(
final InputSplit oldSplit,
final JobConf conf
) throws IOException {
ParquetInputSplit split;
if (oldSplit instanceof FileSplit) {
final Path finalPath = ((FileSplit) oldSplit).getPath();
FileSplit fileSplit = (FileSplit) oldSplit;
final long splitStart = fileSplit.getStart();
final long splitLength = fileSplit.getLength();
final Path finalPath = fileSplit.getPath();
final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());

final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();

final ReadContext readContext = new DataWritableReadSupport()
.init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata()
.get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
final long splitStart = ((FileSplit) oldSplit).getStart();
final long splitLength = ((FileSplit) oldSplit).getLength();
for (final BlockMetaData block : blocks) {
final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
splitGroup.add(block);
}
}
if (splitGroup.isEmpty()) {
LOG.warn("Skipping split, could not find row group in: " + (FileSplit) oldSplit);
split = null;
} else {
split = new ParquetInputSplit(finalPath,
final ReadContext readContext =
new DataWritableReadSupport()
.init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());

schemaSize = MessageTypeParser.parseMessageType(
readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)
).getFieldCount();
return new ParquetInputSplit(
finalPath,
splitStart,
splitStart + splitLength,
splitLength,
((FileSplit) oldSplit).getLocations(),
splitGroup,
fileSplit.getLocations(),
null,
readContext.getRequestedSchema().toString(),
fileMetaData.getSchema().toString(),
fileMetaData.getKeyValueMetaData(),
readContext.getReadSupportMetadata());
}
} else {
throw new IllegalArgumentException("Unknown split type: " + oldSplit);
}
return split;
}
}
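The wrapper above no longer enumerates row groups itself: it reads the footer with SKIP_ROW_GROUPS, builds the split with null row-group offsets, and leaves row-group selection to the reader. A hedged sketch of that footer read, reusing the readFooter call and the SKIP_ROW_GROUPS constant from the diff; the Configuration and the file path are hypothetical:

import static parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.FileMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class FooterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/example.parquet"); // hypothetical file

    // Same call as in getSplit(): read only file-level metadata, skipping per-row-group metadata.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, SKIP_ROW_GROUPS);
    FileMetaData fileMetaData = footer.getFileMetaData();

    // Enough to build the ReadContext and the split; row groups are resolved later by the reader.
    System.out.println(fileMetaData.getSchema());
    System.out.println(fileMetaData.getKeyValueMetaData());
  }
}

Compared to the removed code, which read the full footer and matched row groups against the split boundaries in the wrapper, this defers row-group selection to the reader side.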
