26 | 26 | import java.io.IOException; |
27 | 27 | import java.nio.charset.StandardCharsets; |
28 | 28 | import java.util.ArrayList; |
29 | | -import java.util.Arrays; |
30 | | -import java.util.Collections; |
31 | 29 | import java.util.HashMap; |
32 | 30 | import java.util.HashSet; |
33 | 31 | import java.util.LinkedHashSet; |
34 | 32 | import java.util.List; |
35 | 33 | import java.util.Map; |
36 | 34 | import java.util.Map.Entry; |
37 | | -import java.util.Optional; |
38 | 35 | import java.util.Set; |
39 | 36 |
|
40 | 37 | import org.apache.hadoop.conf.Configuration; |
|
45 | 42 | import org.apache.parquet.Preconditions; |
46 | 43 | import org.apache.parquet.Strings; |
47 | 44 | import org.apache.parquet.Version; |
48 | | -import org.apache.parquet.bytes.ByteBufferAllocator; |
49 | 45 | import org.apache.parquet.bytes.BytesInput; |
50 | 46 | import org.apache.parquet.bytes.BytesUtils; |
51 | | -import org.apache.parquet.bytes.HeapByteBufferAllocator; |
52 | 47 | import org.apache.parquet.column.ColumnDescriptor; |
53 | | -import org.apache.parquet.column.ColumnReader; |
54 | | -import org.apache.parquet.column.ColumnWriteStore; |
55 | | -import org.apache.parquet.column.ColumnWriter; |
56 | 48 | import org.apache.parquet.column.Encoding; |
57 | 49 | import org.apache.parquet.column.EncodingStats; |
58 | 50 | import org.apache.parquet.column.ParquetProperties; |
59 | | -import org.apache.parquet.column.impl.ColumnReadStoreImpl; |
60 | | -import org.apache.parquet.column.impl.ColumnWriteStoreV1; |
61 | 51 | import org.apache.parquet.column.page.DictionaryPage; |
62 | | -import org.apache.parquet.column.page.PageReader; |
63 | 52 | import org.apache.parquet.column.statistics.Statistics; |
64 | | -import org.apache.parquet.example.DummyRecordConverter; |
65 | 53 | import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel; |
66 | 54 | import org.apache.parquet.hadoop.metadata.ColumnPath; |
67 | 55 | import org.apache.parquet.format.Util; |
|
72 | 60 | import org.apache.parquet.hadoop.metadata.FileMetaData; |
73 | 61 | import org.apache.parquet.hadoop.metadata.GlobalMetaData; |
74 | 62 | import org.apache.parquet.hadoop.metadata.ParquetMetadata; |
75 | | -import org.apache.parquet.hadoop.util.BlocksCombiner; |
76 | 63 | import org.apache.parquet.hadoop.util.HadoopOutputFile; |
77 | 64 | import org.apache.parquet.hadoop.util.HadoopStreams; |
78 | 65 | import org.apache.parquet.internal.column.columnindex.ColumnIndex; |
@@ -669,116 +656,6 @@ public void appendFile(InputFile file) throws IOException { |
669 | 656 | } |
670 | 657 | } |
671 | 658 |
|
672 | | - public int merge(List<InputFile> inputFiles, CodecFactory.BytesCompressor compressor, String createdBy, long maxBlockSize) throws IOException { |
673 | | - List<ParquetFileReader> readers = getReaders(inputFiles); |
674 | | - try { |
675 | | - ByteBufferAllocator allocator = new HeapByteBufferAllocator(); |
676 | | - ColumnReadStoreImpl columnReadStore = new ColumnReadStoreImpl(null, new DummyRecordConverter(schema).getRootConverter(), schema, createdBy); |
677 | | - this.start(); |
678 | | - List<BlocksCombiner.SmallBlocksUnion> largeBlocks = BlocksCombiner.combineLargeBlocks(readers, maxBlockSize); |
679 | | - for (BlocksCombiner.SmallBlocksUnion smallBlocks : largeBlocks) { |
680 | | - for (int columnIndex = 0; columnIndex < schema.getColumns().size(); columnIndex++) { |
681 | | - ColumnDescriptor path = schema.getColumns().get(columnIndex); |
682 | | - ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor, schema, allocator, ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH); |
683 | | - ColumnWriteStoreV1 columnWriteStoreV1 = new ColumnWriteStoreV1(schema, store, ParquetProperties.builder().build()); |
684 | | - for (BlocksCombiner.SmallBlock smallBlock : smallBlocks.getBlocks()) { |
685 | | - ParquetFileReader parquetFileReader = smallBlock.getReader(); |
686 | | - try { |
687 | | - Optional<PageReader> columnChunkPageReader = parquetFileReader.readColumnInBlock(smallBlock.getBlockIndex(), path); |
688 | | - ColumnWriter columnWriter = columnWriteStoreV1.getColumnWriter(path); |
689 | | - if (columnChunkPageReader.isPresent()) { |
690 | | - ColumnReader columnReader = columnReadStore.newMemColumnReader(path, columnChunkPageReader.get()); |
691 | | - for (int i = 0; i < columnReader.getTotalValueCount(); i++) { |
692 | | - consumeTriplet(columnWriteStoreV1, columnWriter, columnReader); |
693 | | - } |
694 | | - } else { |
695 | | - MessageType inputFileSchema = parquetFileReader.getFileMetaData().getSchema(); |
696 | | - String[] parentPath = getExisingParentPath(path, inputFileSchema); |
697 | | - int def = parquetFileReader.getFileMetaData().getSchema().getMaxDefinitionLevel(parentPath); |
698 | | - int rep = parquetFileReader.getFileMetaData().getSchema().getMaxRepetitionLevel(parentPath); |
699 | | - for (int i = 0; i < parquetFileReader.getBlockMetaData(smallBlock.getBlockIndex()).getRowCount(); i++) { |
700 | | - columnWriter.writeNull(rep, def); |
701 | | - if (def == 0) { |
702 | | - // V1 pages also respect record boundaries so we have to mark them |
703 | | - columnWriteStoreV1.endRecord(); |
704 | | - } |
705 | | - } |
706 | | - } |
707 | | - } catch (Exception e) { |
708 | | - LOG.error("File {} is not readable", parquetFileReader.getFile(), e); |
709 | | - } |
710 | | - } |
711 | | - if (columnIndex == 0) { |
712 | | - this.startBlock(smallBlocks.getRowCount()); |
713 | | - } |
714 | | - columnWriteStoreV1.flush(); |
715 | | - store.flushToFileWriter(path, this); |
716 | | - } |
717 | | - this.endBlock(); |
718 | | - } |
719 | | - this.end(Collections.emptyMap()); |
720 | | - }finally { |
721 | | - BlocksCombiner.closeReaders(readers); |
722 | | - } |
723 | | - return 0; |
724 | | - } |
725 | | - |
726 | | - private String[] getExisingParentPath(ColumnDescriptor path, MessageType inputFileSchema) { |
727 | | - List<String> parentPath = Arrays.asList(path.getPath()); |
728 | | - while (parentPath.size() > 0 && !inputFileSchema.containsPath(parentPath.toArray(new String[parentPath.size()]))) { |
729 | | - parentPath = parentPath.subList(0, parentPath.size() - 1); |
730 | | - } |
731 | | - return parentPath.toArray(new String[parentPath.size()]); |
732 | | - } |
733 | | - |
734 | | - private List<ParquetFileReader> getReaders(List<InputFile> inputFiles) throws IOException { |
735 | | - List<ParquetFileReader> readers = new ArrayList<>(inputFiles.size()); |
736 | | - for (InputFile inputFile : inputFiles) { |
737 | | - readers.add(ParquetFileReader.open(inputFile)); |
738 | | - } |
739 | | - return readers; |
740 | | - } |
741 | | - |
742 | | - private void consumeTriplet(ColumnWriteStore columnWriteStore, ColumnWriter columnWriter, ColumnReader columnReader) { |
743 | | - int definitionLevel = columnReader.getCurrentDefinitionLevel(); |
744 | | - int repetitionLevel = columnReader.getCurrentRepetitionLevel(); |
745 | | - ColumnDescriptor column = columnReader.getDescriptor(); |
746 | | - PrimitiveType type = column.getPrimitiveType(); |
747 | | - if (definitionLevel < column.getMaxDefinitionLevel()) { |
748 | | - columnWriter.writeNull(repetitionLevel, definitionLevel); |
749 | | - } else { |
750 | | - switch (type.getPrimitiveTypeName()) { |
751 | | - case INT32: |
752 | | - columnWriter.write(columnReader.getInteger(), repetitionLevel, definitionLevel); |
753 | | - break; |
754 | | - case INT64: |
755 | | - columnWriter.write(columnReader.getLong(), repetitionLevel, definitionLevel); |
756 | | - break; |
757 | | - case BINARY: |
758 | | - case FIXED_LEN_BYTE_ARRAY: |
759 | | - case INT96: |
760 | | - columnWriter.write(columnReader.getBinary(), repetitionLevel, definitionLevel); |
761 | | - break; |
762 | | - case BOOLEAN: |
763 | | - columnWriter.write(columnReader.getBoolean(), repetitionLevel, definitionLevel); |
764 | | - break; |
765 | | - case FLOAT: |
766 | | - columnWriter.write(columnReader.getFloat(), repetitionLevel, definitionLevel); |
767 | | - break; |
768 | | - case DOUBLE: |
769 | | - columnWriter.write(columnReader.getDouble(), repetitionLevel, definitionLevel); |
770 | | - break; |
771 | | - default: |
772 | | - throw new IllegalArgumentException("Unknown primitive type " + type); |
773 | | - } |
774 | | - } |
775 | | - columnReader.consume(); |
776 | | - if (repetitionLevel == 0) { |
777 | | - // V1 pages also respect record boundaries so we have to mark them |
778 | | - columnWriteStore.endRecord(); |
779 | | - } |
780 | | - } |
781 | | - |
782 | 659 | /** |
783 | 660 | * @param file a file stream to read from |
784 | 661 | * @param rowGroups row groups to copy |
|