116 changes: 116 additions & 0 deletions api/src/main/java/org/apache/iceberg/ContentFile.java
@@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg;

import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;

/**
* Superinterface of {@link DataFile} and {@link DeleteFile} that exposes common methods.
*
* @param <F> the concrete Java class of a ContentFile instance.
*/
public interface ContentFile<F> {
/**
* @return type of content stored in the file; one of DATA, POSITION_DELETES, or EQUALITY_DELETES
*/
FileContent content();

/**
* @return fully qualified path to the file, suitable for constructing a Hadoop Path
*/
CharSequence path();

/**
* @return format of the file
*/
FileFormat format();

/**
* @return partition for this file as a {@link StructLike}
*/
StructLike partition();

/**
* @return the number of top-level records in the file
*/
long recordCount();

/**
* @return the file size in bytes
*/
long fileSizeInBytes();

/**
* @return if collected, map from column ID to the size of the column in bytes, null otherwise
*/
Map<Integer, Long> columnSizes();

/**
* @return if collected, map from column ID to the count of its non-null values, null otherwise
*/
Map<Integer, Long> valueCounts();

/**
* @return if collected, map from column ID to its null value count, null otherwise
*/
Map<Integer, Long> nullValueCounts();

/**
* @return if collected, map from column ID to value lower bounds, null otherwise
*/
Map<Integer, ByteBuffer> lowerBounds();

/**
* @return if collected, map from column ID to value upper bounds, null otherwise
*/
Map<Integer, ByteBuffer> upperBounds();

/**
* @return metadata about how this file is encrypted, or null if the file is stored in plain
* text.
*/
ByteBuffer keyMetadata();

/**
* @return List of recommended split locations, if applicable, null otherwise.
* When available, this information is used for planning scan tasks whose boundaries
* are determined by these offsets. The returned list must be sorted in ascending order.
*/
List<Long> splitOffsets();

/**
* Copies this file. Manifest readers can reuse file instances; use
* this method to copy data when collecting files from tasks.
*
* @return a copy of this file
*/
F copy();

/**
* Copies this file without file stats. Manifest readers can reuse file instances; use
* this method to copy data without stats when collecting files.
*
* @return a copy of this file, without lower bounds, upper bounds, value counts, or null value counts
*/
F copyWithoutStats();
}
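The point of the new superinterface is that code which only needs the common metadata can accept either a data file or a delete file. A minimal sketch of such a caller (not part of this change; the class and method names are illustrative):

```java
import org.apache.iceberg.ContentFile;

public class ContentFileSummaries {
  // The accessors used here are declared on ContentFile, so this works for
  // DataFile and DeleteFile alike, with no casting.
  static String summarize(ContentFile<?> file) {
    return String.format("%s %s: %d records, %d bytes",
        file.content(), file.path(), file.recordCount(), file.fileSizeInBytes());
  }

  // Manifest readers can reuse file instances, so copy before collecting.
  // The recursive bound keeps the concrete type (DataFile or DeleteFile).
  static <F extends ContentFile<F>> F defensiveCopy(F file) {
    return file.copy();
  }
}
```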
87 changes: 3 additions & 84 deletions api/src/main/java/org/apache/iceberg/DataFile.java
@@ -19,9 +19,6 @@

package org.apache.iceberg;

import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.types.Types.BinaryType;
import org.apache.iceberg.types.Types.IntegerType;
@@ -35,9 +32,9 @@
import static org.apache.iceberg.types.Types.NestedField.required;

/**
* Interface for files listed in a table manifest.
* Interface for data files listed in a table manifest.
*/
public interface DataFile {
public interface DataFile extends ContentFile<DataFile> {
// fields for adding delete data files
Types.NestedField CONTENT = optional(134, "content", IntegerType.get(),
"Contents of the file: 0=data, 1=position deletes, 2=equality deletes");
@@ -86,86 +83,8 @@ static StructType getType(StructType partitionType) {
/**
* @return the content stored in the file; one of DATA, POSITION_DELETES, or EQUALITY_DELETES
*/
@Override
default FileContent content() {
return FileContent.DATA;
}

/**
* @return fully qualified path to the file, suitable for constructing a Hadoop Path
*/
CharSequence path();

/**
* @return format of the data file
*/
FileFormat format();

/**
* @return partition data for this file as a {@link StructLike}
*/
StructLike partition();

/**
* @return the number of top-level records in the data file
*/
long recordCount();

/**
* @return the data file size in bytes
*/
long fileSizeInBytes();

/**
* @return if collected, map from column ID to the size of the column in bytes, null otherwise
*/
Map<Integer, Long> columnSizes();

/**
* @return if collected, map from column ID to the count of its non-null values, null otherwise
*/
Map<Integer, Long> valueCounts();

/**
* @return if collected, map from column ID to its null value count, null otherwise
*/
Map<Integer, Long> nullValueCounts();

/**
* @return if collected, map from column ID to value lower bounds, null otherwise
*/
Map<Integer, ByteBuffer> lowerBounds();

/**
* @return if collected, map from column ID to value upper bounds, null otherwise
*/
Map<Integer, ByteBuffer> upperBounds();

/**
* @return metadata about how this file is encrypted, or null if the file is stored in plain
* text.
*/
ByteBuffer keyMetadata();

/**
* @return List of recommended split locations, if applicable, null otherwise.
* When available, this information is used for planning scan tasks whose boundaries
* are determined by these offsets. The returned list must be sorted in ascending order.
*/
List<Long> splitOffsets();

/**
* Copies this {@link DataFile data file}. Manifest readers can reuse data file instances; use
* this method to copy data when collecting files from tasks.
*
* @return a copy of this data file
*/
DataFile copy();

/**
* Copies this {@link DataFile data file} without file stats. Manifest readers can reuse data file instances; use
* this method to copy data without stats when collecting files.
*
* @return a copy of this data file, without lower bounds, upper bounds, value counts, or null value counts
*/
DataFile copyWithoutStats();
}
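Because content() now has a default implementation returning FileContent.DATA, existing DataFile implementations compile unchanged, and callers can branch on the content type rather than on the Java type. A hedged sketch (not from this PR; the handler methods are hypothetical):

```java
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;

class ContentDispatch {
  static void handle(ContentFile<?> file) {
    switch (file.content()) {
      case DATA:
        handleData((DataFile) file);      // DataFile's default content() is DATA
        break;
      case POSITION_DELETES:
      case EQUALITY_DELETES:
        handleDeletes((DeleteFile) file);
        break;
    }
  }

  private static void handleData(DataFile file) { /* hypothetical handler */ }

  private static void handleDeletes(DeleteFile file) { /* hypothetical handler */ }
}
```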
37 changes: 37 additions & 0 deletions api/src/main/java/org/apache/iceberg/DeleteFile.java
@@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg;

import java.util.List;

/**
* Interface for delete files listed in a table delete manifest.
*/
public interface DeleteFile extends ContentFile<DeleteFile> {
/**
* @return List of recommended split locations, if applicable, null otherwise.
* When available, this information is used for planning scan tasks whose boundaries
* are determined by these offsets. The returned list must be sorted in ascending order.
*/
@Override
default List<Long> splitOffsets() {
return null;
}
}
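DeleteFile inherits everything else from ContentFile and only pins splitOffsets() to null, so planning code should treat a null return as "no recommended splits" rather than assume offsets exist. A small defensive sketch (illustrative, not from this PR):

```java
import java.util.List;
import org.apache.iceberg.ContentFile;

class SplitPlanning {
  // Falls back to one split per file when no offsets were recorded, which is
  // always the case for DeleteFile unless an implementation overrides the default.
  static int splitCount(ContentFile<?> file) {
    List<Long> offsets = file.splitOffsets();
    return offsets == null || offsets.isEmpty() ? 1 : offsets.size();
  }
}
```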
api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
@@ -25,6 +25,7 @@
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor;
@@ -40,7 +41,7 @@
* <p>
* This evaluation is inclusive: it returns true if a file may match and false if it cannot match.
* <p>
* Files are passed to {@link #eval(DataFile)}, which returns true if the file may contain matching
* Files are passed to {@link #eval(ContentFile)}, which returns true if the file may contain matching
* rows and false if the file cannot contain matching rows. Files may be skipped if and only if the
* return value of {@code eval} is false.
*/
@@ -70,7 +71,7 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case
* @param file a data file or delete file
* @return false if the file cannot contain rows that match the expression, true otherwise.
*/
public boolean eval(DataFile file) {
public boolean eval(ContentFile<?> file) {
// TODO: detect the case where a column is missing from the file using file's max field id.
return visitor().eval(file);
}
@@ -84,7 +85,7 @@ private class MetricsEvalVisitor extends BoundExpressionVisitor<Boolean> {
private Map<Integer, ByteBuffer> lowerBounds = null;
private Map<Integer, ByteBuffer> upperBounds = null;

private boolean eval(DataFile file) {
private boolean eval(ContentFile<?> file) {
if (file.recordCount() == 0) {
return ROWS_CANNOT_MATCH;
}
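With eval widened to ContentFile<?>, one evaluator instance can prune both data files and delete files by their column metrics. A usage sketch (the helper class is hypothetical; it uses the three-argument constructor visible in this diff, with caseSensitive assumed to be true):

```java
import java.util.List;
import java.util.stream.Collectors;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.InclusiveMetricsEvaluator;

class MetricsPruning {
  // Keeps only files that may contain matching rows; files for which eval
  // returns false can be skipped safely.
  static <F extends ContentFile<F>> List<F> prune(Schema schema, Expression filter, List<F> files) {
    InclusiveMetricsEvaluator evaluator = new InclusiveMetricsEvaluator(schema, filter, true);
    return files.stream()
        .filter(evaluator::eval)
        .collect(Collectors.toList());
  }
}
```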