Skip to content
76 changes: 57 additions & 19 deletions api/src/main/java/org/apache/iceberg/DataFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.types.Types.BinaryType;
import org.apache.iceberg.types.Types.IntegerType;
import org.apache.iceberg.types.Types.ListType;
Expand All @@ -37,31 +38,68 @@
* Interface for files listed in a table manifest.
*/
public interface DataFile {
// DataFile fields from ManifestEntry with new ids
Types.NestedField STATUS = optional(50, "status", Types.IntegerType.get());
Types.NestedField SNAPSHOT_ID = optional(51, "snapshot_id", Types.LongType.get());
Types.NestedField SEQUENCE_NUMBER = optional(52, "sequence_number", Types.LongType.get());

// original DataFile fields
Types.NestedField FILE_PATH = required(100, "file_path", StringType.get());
Types.NestedField FILE_FORMAT = required(101, "file_format", StringType.get());
Types.NestedField RECORD_COUNT = required(103, "record_count", LongType.get());
Types.NestedField FILE_SIZE = required(104, "file_size_in_bytes", LongType.get());
Types.NestedField COLUMN_SIZES = optional(108, "column_sizes", MapType.ofRequired(117, 118,
IntegerType.get(), LongType.get()));
Types.NestedField VALUE_COUNTS = optional(109, "value_counts", MapType.ofRequired(119, 120,
IntegerType.get(), LongType.get()));
Types.NestedField NULL_VALUE_COUNTS = optional(110, "null_value_counts", MapType.ofRequired(121, 122,
IntegerType.get(), LongType.get()));
Types.NestedField LOWER_BOUNDS = optional(125, "lower_bounds", MapType.ofRequired(126, 127,
IntegerType.get(), BinaryType.get()));
Types.NestedField UPPER_BOUNDS = optional(128, "upper_bounds", MapType.ofRequired(129, 130,
IntegerType.get(), BinaryType.get()));
Types.NestedField KEY_METADATA = optional(131, "key_metadata", BinaryType.get());
Types.NestedField SPLIT_OFFSETS = optional(132, "split_offsets", ListType.ofRequired(133, LongType.get()));
int PARTITION_ID = 102;
String PARTITION_NAME = "partition";
// NEXT ID TO ASSIGN: 134

static StructType getType(StructType partitionType) {
// IDs start at 100 to leave room for changes to ManifestEntry
return StructType.of(
required(100, "file_path", StringType.get()),
required(101, "file_format", StringType.get()),
required(102, "partition", partitionType),
required(103, "record_count", LongType.get()),
required(104, "file_size_in_bytes", LongType.get()),
required(105, "block_size_in_bytes", LongType.get()),
optional(108, "column_sizes", MapType.ofRequired(117, 118,
IntegerType.get(), LongType.get())),
optional(109, "value_counts", MapType.ofRequired(119, 120,
IntegerType.get(), LongType.get())),
optional(110, "null_value_counts", MapType.ofRequired(121, 122,
IntegerType.get(), LongType.get())),
optional(125, "lower_bounds", MapType.ofRequired(126, 127,
IntegerType.get(), BinaryType.get())),
optional(128, "upper_bounds", MapType.ofRequired(129, 130,
IntegerType.get(), BinaryType.get())),
optional(131, "key_metadata", BinaryType.get()),
optional(132, "split_offsets", ListType.ofRequired(133, LongType.get()))
// NEXT ID TO ASSIGN: 134
STATUS,
SNAPSHOT_ID,
SEQUENCE_NUMBER,
FILE_PATH,
FILE_FORMAT,
required(PARTITION_ID, PARTITION_NAME, partitionType),
RECORD_COUNT,
FILE_SIZE,
COLUMN_SIZES,
VALUE_COUNTS,
NULL_VALUE_COUNTS,
LOWER_BOUNDS,
UPPER_BOUNDS,
KEY_METADATA,
SPLIT_OFFSETS
);
}

/**
* @return the status of the file, whether EXISTING, ADDED, or DELETED
*/
FileStatus status();

/**
* @return id of the snapshot in which the file was added to the table
*/
Long snapshotId();

/**
* @return the sequence number of the snapshot in which the file was added to the table
*/
Long sequenceNumber();

/**
* @return fully qualified path to the file, suitable for constructing a Hadoop Path
*/
Expand Down
42 changes: 42 additions & 0 deletions api/src/main/java/org/apache/iceberg/FileStatus.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg;

/**
* The status of a data or delete file in a manifest:
* 0 - existing (added in a different manifest)
* 1 - added to the table in the manifest
* 2 - deleted from the table in the manifest
*/
public enum FileStatus {
EXISTING(0),
ADDED(1),
DELETED(2);

private final int id;

FileStatus(int id) {
this.id = id;
}

public int id() {
return id;
}
}
15 changes: 15 additions & 0 deletions api/src/test/java/org/apache/iceberg/TestHelpers.java
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,21 @@ public TestDataFile(String path, StructLike partition, long recordCount,
this.upperBounds = upperBounds;
}

@Override
public FileStatus status() {
return FileStatus.ADDED;
}

@Override
public Long snapshotId() {
return 0L;
}

@Override
public Long sequenceNumber() {
return 0L;
}

@Override
public CharSequence path() {
return path;
Expand Down
9 changes: 1 addition & 8 deletions core/src/main/java/org/apache/iceberg/AllDataFilesTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.util.ParallelIterable;
import org.apache.iceberg.util.ThreadPools;

Expand Down Expand Up @@ -66,13 +65,7 @@ public TableScan newScan() {

@Override
public Schema schema() {
Schema schema = new Schema(DataFile.getType(table.spec().partitionType()).fields());
if (table.spec().fields().size() < 1) {
// avoid returning an empty struct, which is not always supported. instead, drop the partition field (id 102)
return TypeUtil.selectNot(schema, Sets.newHashSet(102));
} else {
return schema;
}
return MetadataTables.filesTableSchema(table.spec().partitionType());
}

public static class AllDataFilesTableScan extends BaseAllMetadataTableScan {
Expand Down
10 changes: 1 addition & 9 deletions core/src/main/java/org/apache/iceberg/AllEntriesTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,10 @@
package org.apache.iceberg;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
import java.util.Collection;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.TypeUtil;

/**
* A {@link Table} implementation that exposes a table's manifest entries as rows.
Expand Down Expand Up @@ -59,13 +57,7 @@ public TableScan newScan() {

@Override
public Schema schema() {
Schema schema = ManifestEntry.getSchema(table.spec().partitionType());
if (table.spec().fields().size() < 1) {
// avoid returning an empty struct, which is not always supported. instead, drop the partition field (id 102)
return TypeUtil.selectNot(schema, Sets.newHashSet(102));
} else {
return schema;
}
return MetadataTables.entriesTableSchema(table.spec().partitionType());
}

private static class Scan extends BaseAllMetadataTableScan {
Expand Down
10 changes: 1 addition & 9 deletions core/src/main/java/org/apache/iceberg/DataFilesTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,11 @@

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
import java.util.Collection;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.types.TypeUtil;

/**
* A {@link Table} implementation that exposes a table's data files as rows.
Expand Down Expand Up @@ -58,13 +56,7 @@ public TableScan newScan() {

@Override
public Schema schema() {
Schema schema = new Schema(DataFile.getType(table.spec().partitionType()).fields());
if (table.spec().fields().size() < 1) {
// avoid returning an empty struct, which is not always supported. instead, drop the partition field (id 102)
return TypeUtil.selectNot(schema, Sets.newHashSet(102));
} else {
return schema;
}
return MetadataTables.filesTableSchema(table.spec().partitionType());
}

public static class FilesTableScan extends BaseTableScan {
Expand Down
Loading