Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion api/src/main/java/org/apache/iceberg/ManifestFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ public interface ManifestFile {
required(509, "contains_null", Types.BooleanType.get()),
optional(510, "lower_bound", Types.BinaryType.get()), // null if no non-null values
optional(511, "upper_bound", Types.BinaryType.get())
))));
))),
optional(512, "added_rows_count", Types.LongType.get()),
optional(513, "existing_rows_count", Types.LongType.get()),
optional(514, "deleted_rows_count", Types.LongType.get()));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add these up by the data files counts?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rdblue, not sure I got. Do you mean whether we can avoid storing these and add them up?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I meant that we don't need to add these at the end of the schema. We can add them up by the file count columns, like you do with all of the accessor methods.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, let me update this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Contributor Author

@aokolnychyi aokolnychyi Jan 21, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, reordering actually led to failures while writing manifest lists as GenericAvroWriter doesn't take into account field ids.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rdblue, do we want to build something as ProjectionDatumReader for the write side?

Copy link
Contributor Author

@aokolnychyi aokolnychyi Jan 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rdblue, I reverted this change. I propose to merge this PR as is and create a follow-up issue to implement writers that take into account field ids.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Writers don't use field IDs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rdblue, yes, GenericAvroWriter doesn't. I think SparkAvroWriter does respect field ids.


static Schema schema() {
return SCHEMA;
Expand Down Expand Up @@ -82,6 +85,11 @@ default boolean hasAddedFiles() {
*/
Integer addedFilesCount();

/**
* @return the total number of rows in all data files with status ADDED in the manifest file
*/
Long addedRowsCount();

/**
* Returns true if the manifest contains EXISTING entries or if the count is not known.
*
Expand All @@ -96,6 +104,11 @@ default boolean hasExistingFiles() {
*/
Integer existingFilesCount();

/**
* @return the total number of rows in all data files with status EXISTING in the manifest file
*/
Long existingRowsCount();

/**
* Returns true if the manifest contains DELETED entries or if the count is not known.
*
Expand All @@ -110,6 +123,11 @@ default boolean hasDeletedFiles() {
*/
Integer deletedFilesCount();

/**
* @return the total number of rows in all data files with status DELETED in the manifest file
*/
Long deletedRowsCount();

/**
* Returns a list of {@link PartitionFieldSummary partition field summaries}.
* <p>
Expand Down
38 changes: 38 additions & 0 deletions api/src/test/java/org/apache/iceberg/TestHelpers.java
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,11 @@ public static class TestManifestFile implements ManifestFile {
private final int specId;
private final Long snapshotId;
private final Integer addedFiles;
private final Long addedRows;
private final Integer existingFiles;
private final Long existingRows;
private final Integer deletedFiles;
private final Long deletedRows;
private final List<PartitionFieldSummary> partitions;

public TestManifestFile(String path, long length, int specId, Long snapshotId,
Expand All @@ -177,8 +180,28 @@ public TestManifestFile(String path, long length, int specId, Long snapshotId,
this.specId = specId;
this.snapshotId = snapshotId;
this.addedFiles = addedFiles;
this.addedRows = null;
this.existingFiles = existingFiles;
this.existingRows = null;
this.deletedFiles = deletedFiles;
this.deletedRows = null;
this.partitions = partitions;
}

public TestManifestFile(String path, long length, int specId, Long snapshotId,
Integer addedFiles, Long addedRows, Integer existingFiles,
Long existingRows, Integer deletedFiles, Long deletedRows,
List<PartitionFieldSummary> partitions) {
this.path = path;
this.length = length;
this.specId = specId;
this.snapshotId = snapshotId;
this.addedFiles = addedFiles;
this.addedRows = addedRows;
this.existingFiles = existingFiles;
this.existingRows = existingRows;
this.deletedFiles = deletedFiles;
this.deletedRows = deletedRows;
this.partitions = partitions;
}

Expand Down Expand Up @@ -207,16 +230,31 @@ public Integer addedFilesCount() {
return addedFiles;
}

@Override
public Long addedRowsCount() {
return addedRows;
}

@Override
public Integer existingFilesCount() {
return existingFiles;
}

@Override
public Long existingRowsCount() {
return existingRows;
}

@Override
public Integer deletedFilesCount() {
return deletedFiles;
}

@Override
public Long deletedRowsCount() {
return deletedRows;
}

@Override
public List<PartitionFieldSummary> partitions() {
return partitions;
Expand Down
64 changes: 64 additions & 0 deletions core/src/main/java/org/apache/iceberg/GenericManifestFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ public class GenericManifestFile
private int specId = -1;
private Long snapshotId = null;
private Integer addedFilesCount = null;
private Long addedRowsCount = null;
private Integer existingFilesCount = null;
private Long existingRowsCount = null;
private Integer deletedFilesCount = null;
private Long deletedRowsCount = null;
private List<PartitionFieldSummary> partitions = null;

/**
Expand Down Expand Up @@ -87,8 +90,11 @@ public GenericManifestFile(org.apache.avro.Schema avroSchema) {
this.specId = specId;
this.snapshotId = null;
this.addedFilesCount = null;
this.addedRowsCount = null;
this.existingFilesCount = null;
this.existingRowsCount = null;
this.deletedFilesCount = null;
this.deletedRowsCount = null;
this.partitions = null;
this.fromProjectionPos = null;
}
Expand All @@ -102,8 +108,30 @@ public GenericManifestFile(String path, long length, int specId, long snapshotId
this.specId = specId;
this.snapshotId = snapshotId;
this.addedFilesCount = addedFilesCount;
this.addedRowsCount = null;
this.existingFilesCount = existingFilesCount;
this.existingRowsCount = null;
this.deletedFilesCount = deletedFilesCount;
this.deletedRowsCount = null;
this.partitions = partitions;
this.fromProjectionPos = null;
}

public GenericManifestFile(String path, long length, int specId, long snapshotId,
int addedFilesCount, long addedRowsCount, int existingFilesCount,
long existingRowsCount, int deletedFilesCount, long deletedRowsCount,
List<PartitionFieldSummary> partitions) {
this.avroSchema = AVRO_SCHEMA;
this.manifestPath = path;
this.length = length;
this.specId = specId;
this.snapshotId = snapshotId;
this.addedFilesCount = addedFilesCount;
this.addedRowsCount = addedRowsCount;
this.existingFilesCount = existingFilesCount;
this.existingRowsCount = existingRowsCount;
this.deletedFilesCount = deletedFilesCount;
this.deletedRowsCount = deletedRowsCount;
this.partitions = partitions;
this.fromProjectionPos = null;
}
Expand All @@ -120,8 +148,11 @@ private GenericManifestFile(GenericManifestFile toCopy) {
this.specId = toCopy.specId;
this.snapshotId = toCopy.snapshotId;
this.addedFilesCount = toCopy.addedFilesCount;
this.addedRowsCount = toCopy.addedRowsCount;
this.existingFilesCount = toCopy.existingFilesCount;
this.existingRowsCount = toCopy.existingRowsCount;
this.deletedFilesCount = toCopy.deletedFilesCount;
this.deletedRowsCount = toCopy.deletedRowsCount;
this.partitions = ImmutableList.copyOf(Iterables.transform(toCopy.partitions, PartitionFieldSummary::copy));
this.fromProjectionPos = toCopy.fromProjectionPos;
}
Expand Down Expand Up @@ -170,16 +201,31 @@ public Integer addedFilesCount() {
return addedFilesCount;
}

@Override
public Long addedRowsCount() {
return addedRowsCount;
}

@Override
public Integer existingFilesCount() {
return existingFilesCount;
}

@Override
public Long existingRowsCount() {
return existingRowsCount;
}

@Override
public Integer deletedFilesCount() {
return deletedFilesCount;
}

@Override
public Long deletedRowsCount() {
return deletedRowsCount;
}

@Override
public List<PartitionFieldSummary> partitions() {
return partitions;
Expand Down Expand Up @@ -219,6 +265,12 @@ public Object get(int i) {
return deletedFilesCount;
case 7:
return partitions;
case 8:
return addedRowsCount;
case 9:
return existingRowsCount;
case 10:
return deletedRowsCount;
default:
throw new UnsupportedOperationException("Unknown field ordinal: " + pos);
}
Expand Down Expand Up @@ -258,6 +310,15 @@ public <T> void set(int i, T value) {
case 7:
this.partitions = (List<PartitionFieldSummary>) value;
return;
case 8:
this.addedRowsCount = (Long) value;
return;
case 9:
this.existingRowsCount = (Long) value;
return;
case 10:
this.deletedRowsCount = (Long) value;
return;
default:
// ignore the object, it must be from a newer version of the format
}
Expand Down Expand Up @@ -302,8 +363,11 @@ public String toString() {
.add("partition_spec_id", specId)
.add("added_snapshot_id", snapshotId)
.add("added_data_files_count", addedFilesCount)
.add("added_rows_count", addedRowsCount)
.add("existing_data_files_count", existingFilesCount)
.add("existing_rows_count", existingRowsCount)
.add("deleted_data_files_count", deletedFilesCount)
.add("deleted_rows_count", deletedRowsCount)
.add("partitions", partitions)
.toString();
}
Expand Down
8 changes: 7 additions & 1 deletion core/src/main/java/org/apache/iceberg/ManifestWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,11 @@ public static ManifestWriter write(PartitionSpec spec, OutputFile outputFile) {

private boolean closed = false;
private int addedFiles = 0;
private long addedRows = 0L;
private int existingFiles = 0;
private long existingRows = 0L;
private int deletedFiles = 0;
private long deletedRows = 0L;

ManifestWriter(PartitionSpec spec, OutputFile file, long snapshotId) {
this.file = file;
Expand All @@ -121,12 +124,15 @@ void addEntry(ManifestEntry entry) {
switch (entry.status()) {
case ADDED:
addedFiles += 1;
addedRows += entry.file().recordCount();
break;
case EXISTING:
existingFiles += 1;
existingRows += entry.file().recordCount();
break;
case DELETED:
deletedFiles += 1;
deletedRows += entry.file().recordCount();
break;
}
stats.update(entry.file().partition());
Expand Down Expand Up @@ -195,7 +201,7 @@ public long length() {
public ManifestFile toManifestFile() {
Preconditions.checkState(closed, "Cannot build ManifestFile, writer is not closed");
return new GenericManifestFile(file.location(), writer.length(), specId, snapshotId,
addedFiles, existingFiles, deletedFiles, stats.summaries());
addedFiles, addedRows, existingFiles, existingRows, deletedFiles, deletedRows, stats.summaries());
}

@Override
Expand Down
9 changes: 8 additions & 1 deletion core/src/main/java/org/apache/iceberg/SnapshotProducer.java
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,11 @@ private static ManifestFile addMetadata(TableOperations ops, ManifestFile manife
ops.io().newInputFile(manifest.path()), ops.current().specsById())) {
PartitionSummary stats = new PartitionSummary(ops.current().spec(manifest.partitionSpecId()));
int addedFiles = 0;
long addedRows = 0L;
int existingFiles = 0;
long existingRows = 0L;
int deletedFiles = 0;
long deletedRows = 0L;

Long snapshotId = null;
long maxSnapshotId = Long.MIN_VALUE;
Expand All @@ -325,15 +328,18 @@ private static ManifestFile addMetadata(TableOperations ops, ManifestFile manife
switch (entry.status()) {
case ADDED:
addedFiles += 1;
addedRows += entry.file().recordCount();
if (snapshotId == null) {
snapshotId = entry.snapshotId();
}
break;
case EXISTING:
existingFiles += 1;
existingRows += entry.file().recordCount();
break;
case DELETED:
deletedFiles += 1;
deletedRows += entry.file().recordCount();
if (snapshotId == null) {
snapshotId = entry.snapshotId();
}
Expand All @@ -349,7 +355,8 @@ private static ManifestFile addMetadata(TableOperations ops, ManifestFile manife
}

return new GenericManifestFile(manifest.path(), manifest.length(), manifest.partitionSpecId(),
snapshotId, addedFiles, existingFiles, deletedFiles, stats.summaries());
snapshotId, addedFiles, addedRows, existingFiles, existingRows, deletedFiles, deletedRows,
stats.summaries());

} catch (IOException e) {
throw new RuntimeIOException(e, "Failed to read manifest: %s", manifest.path());
Expand Down
Loading