From a9db685f4bb2683754a95048d100a9f0898607d2 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 30 Nov 2022 08:27:42 -0800 Subject: [PATCH 01/12] core: add JSON parser for ContentFile and FileScanTask --- .../java/org/apache/iceberg/FileScanTask.java | 5 + .../apache/iceberg/BaseContentScanTask.java | 15 +- .../org/apache/iceberg/BaseFileScanTask.java | 5 + .../org/apache/iceberg/ContentFileParser.java | 286 ++++++++++++++++++ .../java/org/apache/iceberg/DataFiles.java | 13 +- .../apache/iceberg/FileScanTaskParser.java | 141 +++++++++ .../org/apache/iceberg/GenericDataFile.java | 3 +- .../org/apache/iceberg/TableTestBase.java | 2 +- .../apache/iceberg/TestContentFileParser.java | 229 ++++++++++++++ .../iceberg/TestFileScanTaskParser.java | 89 ++++++ .../iceberg/TestManifestWriterVersions.java | 2 +- 11 files changed, 785 insertions(+), 5 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/ContentFileParser.java create mode 100644 core/src/main/java/org/apache/iceberg/FileScanTaskParser.java create mode 100644 core/src/test/java/org/apache/iceberg/TestContentFileParser.java create mode 100644 core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java diff --git a/api/src/main/java/org/apache/iceberg/FileScanTask.java b/api/src/main/java/org/apache/iceberg/FileScanTask.java index d99d924370ad..5fb4b55459e3 100644 --- a/api/src/main/java/org/apache/iceberg/FileScanTask.java +++ b/api/src/main/java/org/apache/iceberg/FileScanTask.java @@ -29,6 +29,11 @@ public interface FileScanTask extends ContentScanTask, SplittableScanT */ List deletes(); + /** Return the schema for this file scan task. */ + default Schema schema() { + throw new UnsupportedOperationException("Does not support schema getter"); + } + @Override default long sizeBytes() { return length() + deletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum(); diff --git a/core/src/main/java/org/apache/iceberg/BaseContentScanTask.java b/core/src/main/java/org/apache/iceberg/BaseContentScanTask.java index e15b2b3f85d6..1521133c6466 100644 --- a/core/src/main/java/org/apache/iceberg/BaseContentScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseContentScanTask.java @@ -34,6 +34,7 @@ abstract class BaseContentScanTask, F extends C private final String specString; private final ResidualEvaluator residuals; + private transient volatile Schema schema = null; private transient volatile PartitionSpec spec = null; BaseContentScanTask(F file, String schemaString, String specString, ResidualEvaluator residuals) { @@ -52,12 +53,24 @@ public F file() { return file; } + protected Schema schema() { + if (schema == null) { + synchronized (this) { + if (schema == null) { + this.schema = SchemaParser.fromJson(schemaString); + } + } + } + + return schema; + } + @Override public PartitionSpec spec() { if (spec == null) { synchronized (this) { if (spec == null) { - this.spec = PartitionSpecParser.fromJson(SchemaParser.fromJson(schemaString), specString); + this.spec = PartitionSpecParser.fromJson(schema(), specString); } } } diff --git a/core/src/main/java/org/apache/iceberg/BaseFileScanTask.java b/core/src/main/java/org/apache/iceberg/BaseFileScanTask.java index 2d7258be717a..bff2d724f7d7 100644 --- a/core/src/main/java/org/apache/iceberg/BaseFileScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseFileScanTask.java @@ -53,6 +53,11 @@ public List deletes() { return ImmutableList.copyOf(deletes); } + @Override + public Schema schema() { + return super.schema(); + } + @VisibleForTesting static final class 
SplitScanTask implements FileScanTask, MergeableScanTask { private final long len; diff --git a/core/src/main/java/org/apache/iceberg/ContentFileParser.java b/core/src/main/java/org/apache/iceberg/ContentFileParser.java new file mode 100644 index 000000000000..c777fc798ed6 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/ContentFileParser.java @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.JsonUtil; + +class ContentFileParser { + private static final String SPEC_ID = "spec-id"; + private static final String CONTENT = "content"; + private static final String FILE_PATH = "file-path"; + private static final String FILE_FORMAT = "file-format"; + private static final String PARTITION = "partition"; + private static final String RECORD_COUNT = "record-count"; + private static final String FILE_SIZE = "file-size-in-bytes"; + private static final String COLUMN_SIZES = "column-sizes"; + private static final String VALUE_COUNTS = "value-counts"; + private static final String NULL_VALUE_COUNTS = "null-value-counts"; + private static final String NAN_VALUE_COUNTS = "nan-value-counts"; + private static final String LOWER_BOUNDS = "lower-bounds"; + private static final String UPPER_BOUNDS = "upper-bounds"; + private static final String KEY_METADATA = "key-metadata"; + private static final String SPLIT_OFFSETS = "split-offsets"; + private static final String EQUALITY_IDS = "equality-ids"; + private static final String SORT_ORDER_ID = "sort-order-id"; + + private final PartitionSpec spec; + + ContentFileParser(PartitionSpec spec) { + this.spec = spec; + } + + private boolean hasPartitionData(StructLike partitionData) { + return partitionData != null && partitionData.size() > 0; + } + + void toJson(ContentFile contentFile, JsonGenerator generator) throws IOException { + generator.writeStartObject(); + + // ignore the ordinal position (ContentFile#pos) of the file in a manifest, + // as it isn't used and BaseFile constructor doesn't support it. 
+
+    Preconditions.checkArgument(
+        contentFile.specId() == spec.specId(),
+        "Partition spec id mismatch: expected = %s, actual = %s",
+        spec.specId(),
+        contentFile.specId());
+    generator.writeNumberField(SPEC_ID, contentFile.specId());
+
+    generator.writeStringField(CONTENT, contentFile.content().name());
+    generator.writeStringField(FILE_PATH, contentFile.path().toString());
+    generator.writeStringField(FILE_FORMAT, contentFile.format().name());
+
+    Preconditions.checkArgument(
+        spec.isPartitioned() == hasPartitionData(contentFile.partition()),
+        "Invalid data file: partition data (%s) doesn't match the expected (%s)",
+        hasPartitionData(contentFile.partition()),
+        spec.isPartitioned());
+    if (contentFile.partition() != null) {
+      generator.writeFieldName(PARTITION);
+      SingleValueParser.toJson(spec.partitionType(), contentFile.partition(), generator);
+    }
+
+    generator.writeNumberField(RECORD_COUNT, contentFile.recordCount());
+    generator.writeNumberField(FILE_SIZE, contentFile.fileSizeInBytes());
+
+    if (contentFile.columnSizes() != null) {
+      generator.writeFieldName(COLUMN_SIZES);
+      SingleValueParser.toJson(DataFile.COLUMN_SIZES.type(), contentFile.columnSizes(), generator);
+    }
+
+    if (contentFile.valueCounts() != null) {
+      generator.writeFieldName(VALUE_COUNTS);
+      SingleValueParser.toJson(DataFile.VALUE_COUNTS.type(), contentFile.valueCounts(), generator);
+    }
+
+    if (contentFile.nullValueCounts() != null) {
+      generator.writeFieldName(NULL_VALUE_COUNTS);
+      SingleValueParser.toJson(
+          DataFile.NULL_VALUE_COUNTS.type(), contentFile.nullValueCounts(), generator);
+    }
+
+    if (contentFile.nanValueCounts() != null) {
+      generator.writeFieldName(NAN_VALUE_COUNTS);
+      SingleValueParser.toJson(
+          DataFile.NAN_VALUE_COUNTS.type(), contentFile.nanValueCounts(), generator);
+    }
+
+    if (contentFile.lowerBounds() != null) {
+      generator.writeFieldName(LOWER_BOUNDS);
+      SingleValueParser.toJson(DataFile.LOWER_BOUNDS.type(), contentFile.lowerBounds(), generator);
+    }
+
+    if (contentFile.upperBounds() != null) {
+      generator.writeFieldName(UPPER_BOUNDS);
+      SingleValueParser.toJson(DataFile.UPPER_BOUNDS.type(), contentFile.upperBounds(), generator);
+    }
+
+    if (contentFile.keyMetadata() != null) {
+      generator.writeFieldName(KEY_METADATA);
+      SingleValueParser.toJson(DataFile.KEY_METADATA.type(), contentFile.keyMetadata(), generator);
+    }
+
+    if (contentFile.splitOffsets() != null) {
+      generator.writeFieldName(SPLIT_OFFSETS);
+      SingleValueParser.toJson(
+          DataFile.SPLIT_OFFSETS.type(), contentFile.splitOffsets(), generator);
+    }
+
+    if (contentFile.equalityFieldIds() != null) {
+      generator.writeFieldName(EQUALITY_IDS);
+      SingleValueParser.toJson(
+          DataFile.EQUALITY_IDS.type(), contentFile.equalityFieldIds(), generator);
+    }
+
+    if (contentFile.sortOrderId() != null) {
+      generator.writeNumberField(SORT_ORDER_ID, contentFile.sortOrderId());
+    }
+
+    generator.writeEndObject();
+  }
+
+  ContentFile fromJson(JsonNode jsonNode) {
+    Preconditions.checkArgument(
+        jsonNode.isObject(), "Cannot parse content file from a non-object: %s", jsonNode);
+
+    int specId = JsonUtil.getInt(SPEC_ID, jsonNode);
+    FileContent fileContent = FileContent.valueOf(JsonUtil.getString(CONTENT, jsonNode));
+    String filePath = JsonUtil.getString(FILE_PATH, jsonNode);
+    FileFormat fileFormat = FileFormat.valueOf(JsonUtil.getString(FILE_FORMAT, jsonNode));
+
+    PartitionData partitionData = null;
+    if (jsonNode.has(PARTITION)) {
+      partitionData = new PartitionData(spec.partitionType());
+      StructLike structLike =
+          (StructLike)
SingleValueParser.fromJson(spec.partitionType(), jsonNode.get(PARTITION)); + Preconditions.checkState( + partitionData.size() == structLike.size(), + "Invalid partition data size: expected = %s, actual = %s", + partitionData.size(), + structLike.size()); + for (int pos = 0; pos < partitionData.size(); ++pos) { + Class javaClass = spec.partitionType().fields().get(pos).type().typeId().javaClass(); + partitionData.set(pos, structLike.get(pos, javaClass)); + } + } + + long fileSizeInBytes = JsonUtil.getLong(FILE_SIZE, jsonNode); + Metrics metrics = toMetrics(jsonNode); + + ByteBuffer keyMetadata = null; + if (jsonNode.has(KEY_METADATA)) { + keyMetadata = + (ByteBuffer) + SingleValueParser.fromJson(DataFile.KEY_METADATA.type(), jsonNode.get(KEY_METADATA)); + } + + List splitOffsets = null; + if (jsonNode.has(SPLIT_OFFSETS)) { + splitOffsets = + (List) + SingleValueParser.fromJson( + DataFile.SPLIT_OFFSETS.type(), jsonNode.get(SPLIT_OFFSETS)); + } + + int[] equalityFieldIds = null; + if (jsonNode.has(EQUALITY_IDS)) { + equalityFieldIds = + ArrayUtil.toIntArray( + (List) + SingleValueParser.fromJson( + DataFile.EQUALITY_IDS.type(), jsonNode.get(EQUALITY_IDS))); + } + + Integer sortOrderId = null; + if (jsonNode.has(SORT_ORDER_ID)) { + sortOrderId = JsonUtil.getInt(SORT_ORDER_ID, jsonNode); + } + + if (fileContent == FileContent.DATA) { + return new GenericDataFile( + specId, + filePath, + fileFormat, + partitionData, + fileSizeInBytes, + metrics, + keyMetadata, + splitOffsets, + equalityFieldIds, + sortOrderId); + } else { + return new GenericDeleteFile( + specId, + fileContent, + filePath, + fileFormat, + partitionData, + fileSizeInBytes, + metrics, + equalityFieldIds, + sortOrderId, + keyMetadata); + } + } + + private Metrics toMetrics(JsonNode jsonNode) { + long recordCount = JsonUtil.getLong(RECORD_COUNT, jsonNode); + + Map columnSizes = null; + if (jsonNode.has(COLUMN_SIZES)) { + columnSizes = + (Map) + SingleValueParser.fromJson(DataFile.COLUMN_SIZES.type(), jsonNode.get(COLUMN_SIZES)); + } + + Map valueCounts = null; + if (jsonNode.has(VALUE_COUNTS)) { + valueCounts = + (Map) + SingleValueParser.fromJson(DataFile.VALUE_COUNTS.type(), jsonNode.get(VALUE_COUNTS)); + } + + Map nullValueCounts = null; + if (jsonNode.has(NULL_VALUE_COUNTS)) { + nullValueCounts = + (Map) + SingleValueParser.fromJson( + DataFile.NULL_VALUE_COUNTS.type(), jsonNode.get(NULL_VALUE_COUNTS)); + } + + Map nanValueCounts = null; + if (jsonNode.has(NAN_VALUE_COUNTS)) { + nanValueCounts = + (Map) + SingleValueParser.fromJson( + DataFile.NAN_VALUE_COUNTS.type(), jsonNode.get(NAN_VALUE_COUNTS)); + } + + Map lowerBounds = null; + if (jsonNode.has(LOWER_BOUNDS)) { + lowerBounds = + (Map) + SingleValueParser.fromJson(DataFile.LOWER_BOUNDS.type(), jsonNode.get(LOWER_BOUNDS)); + } + + Map upperBounds = null; + if (jsonNode.has(UPPER_BOUNDS)) { + upperBounds = + (Map) + SingleValueParser.fromJson(DataFile.UPPER_BOUNDS.type(), jsonNode.get(UPPER_BOUNDS)); + } + + return new Metrics( + recordCount, + columnSizes, + valueCounts, + nullValueCounts, + nanValueCounts, + lowerBounds, + upperBounds); + } +} diff --git a/core/src/main/java/org/apache/iceberg/DataFiles.java b/core/src/main/java/org/apache/iceberg/DataFiles.java index ef95c0bdf632..7d6c34d00bb9 100644 --- a/core/src/main/java/org/apache/iceberg/DataFiles.java +++ b/core/src/main/java/org/apache/iceberg/DataFiles.java @@ -29,6 +29,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.util.ArrayUtil; import org.apache.iceberg.util.ByteBuffers; public class DataFiles { @@ -123,7 +124,6 @@ public static class Builder { private FileFormat format = null; private long recordCount = -1L; private long fileSizeInBytes = -1L; - private Integer sortOrderId = SortOrder.unsorted().orderId(); // optional fields private Map columnSizes = null; @@ -134,6 +134,8 @@ public static class Builder { private Map upperBounds = null; private ByteBuffer keyMetadata = null; private List splitOffsets = null; + private List equalityFieldIds = null; + private Integer sortOrderId = SortOrder.unsorted().orderId(); public Builder(PartitionSpec spec) { this.spec = spec; @@ -269,6 +271,14 @@ public Builder withSplitOffsets(List offsets) { return this; } + public Builder withEqualityFieldIds(List equalityIds) { + if (equalityIds != null) { + this.equalityFieldIds = copyList(equalityIds); + } + + return this; + } + public Builder withEncryptionKeyMetadata(ByteBuffer newKeyMetadata) { this.keyMetadata = newKeyMetadata; return this; @@ -310,6 +320,7 @@ public DataFile build() { upperBounds), keyMetadata, splitOffsets, + ArrayUtil.toIntArray(equalityFieldIds), sortOrderId); } } diff --git a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java new file mode 100644 index 000000000000..6399c455af2b --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import java.io.IOException; +import java.io.StringWriter; +import java.io.UncheckedIOException; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.ExpressionParser; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.ResidualEvaluator; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.util.JsonUtil; + +public class FileScanTaskParser { + private static final String SCHEMA = "schema"; + private static final String SPEC = "spec"; + private static final String DATA_FILE = "data-file"; + private static final String DELETE_FILES = "delete-files"; + private static final String RESIDUAL = "residual-filter"; + + private final boolean caseSensitive; + private final LoadingCache contentFileParsersBySpec; + + public FileScanTaskParser(boolean caseSensitive) { + this.caseSensitive = caseSensitive; + this.contentFileParsersBySpec = + Caffeine.newBuilder().weakKeys().build(spec -> new ContentFileParser(spec)); + } + + public String toJson(FileScanTask fileScanTask) { + try (StringWriter writer = new StringWriter()) { + JsonGenerator generator = JsonUtil.factory().createGenerator(writer); + toJson(fileScanTask, generator); + generator.flush(); + return writer.toString(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to write json for: " + fileScanTask, e); + } + } + + void toJson(FileScanTask fileScanTask, JsonGenerator generator) throws IOException { + generator.writeStartObject(); + + generator.writeFieldName(SCHEMA); + SchemaParser.toJson(fileScanTask.schema(), generator); + + generator.writeFieldName(SPEC); + PartitionSpecParser.toJson(fileScanTask.spec(), generator); + + ContentFileParser contentFileParser = contentFileParsersBySpec.get(fileScanTask.spec()); + + if (fileScanTask.file() != null) { + generator.writeFieldName(DATA_FILE); + contentFileParser.toJson(fileScanTask.file(), generator); + } + + if (fileScanTask.deletes() != null) { + generator.writeArrayFieldStart(DELETE_FILES); + for (DeleteFile deleteFile : fileScanTask.deletes()) { + contentFileParser.toJson(deleteFile, generator); + } + generator.writeEndArray(); + } + + if (fileScanTask.residual() != null) { + generator.writeFieldName(RESIDUAL); + ExpressionParser.toJson(fileScanTask.residual(), generator); + } + + generator.writeEndObject(); + } + + public FileScanTask fromJson(String json) { + return JsonUtil.parse(json, this::fromJson); + } + + FileScanTask fromJson(JsonNode jsonNode) { + Preconditions.checkArgument( + jsonNode.isObject(), "Cannot parse file scan task from a non-object: %s", jsonNode); + + JsonNode schemaNode = jsonNode.get(SCHEMA); + Schema schema = SchemaParser.fromJson(schemaNode); + String schemaString = SchemaParser.toJson(schema); + + JsonNode specNode = jsonNode.get(SPEC); + PartitionSpec spec = PartitionSpecParser.fromJson(schema, specNode); + String specString = PartitionSpecParser.toJson(spec); + + ContentFileParser contentFileParser = contentFileParsersBySpec.get(spec); + + DataFile dataFile = null; + if (jsonNode.has(DATA_FILE)) { + dataFile = (DataFile) contentFileParser.fromJson(jsonNode.get(DATA_FILE)); + } + + DeleteFile[] deleteFiles 
= null; + if (jsonNode.has(DELETE_FILES)) { + JsonNode deletesArray = jsonNode.get(DELETE_FILES); + Preconditions.checkArgument( + deletesArray.isArray(), "Cannot parse delete files from a non-array: %s", deletesArray); + // parse the schema array + ImmutableList.Builder builder = ImmutableList.builder(); + for (JsonNode deleteFileNode : deletesArray) { + DeleteFile deleteFile = (DeleteFile) contentFileParser.fromJson(deleteFileNode); + builder.add(deleteFile); + } + + deleteFiles = builder.build().toArray(new DeleteFile[0]); + } + + Expression filter = Expressions.alwaysTrue(); + if (jsonNode.has(RESIDUAL)) { + filter = ExpressionParser.fromJson(jsonNode.get(RESIDUAL)); + } + + ResidualEvaluator residualEvaluator = ResidualEvaluator.of(spec, filter, caseSensitive); + return new BaseFileScanTask(dataFile, deleteFiles, schemaString, specString, residualEvaluator); + } +} diff --git a/core/src/main/java/org/apache/iceberg/GenericDataFile.java b/core/src/main/java/org/apache/iceberg/GenericDataFile.java index 34c65e669fb2..07c5172f1b3f 100644 --- a/core/src/main/java/org/apache/iceberg/GenericDataFile.java +++ b/core/src/main/java/org/apache/iceberg/GenericDataFile.java @@ -40,6 +40,7 @@ class GenericDataFile extends BaseFile implements DataFile { Metrics metrics, ByteBuffer keyMetadata, List splitOffsets, + int[] equalityFieldIds, Integer sortOrderId) { super( specId, @@ -56,7 +57,7 @@ class GenericDataFile extends BaseFile implements DataFile { metrics.lowerBounds(), metrics.upperBounds(), splitOffsets, - null, + equalityFieldIds, sortOrderId, keyMetadata); } diff --git a/core/src/test/java/org/apache/iceberg/TableTestBase.java b/core/src/test/java/org/apache/iceberg/TableTestBase.java index 038dfadbff05..a800214bc9a7 100644 --- a/core/src/test/java/org/apache/iceberg/TableTestBase.java +++ b/core/src/test/java/org/apache/iceberg/TableTestBase.java @@ -57,7 +57,7 @@ public class TableTestBase { protected static final int BUCKETS_NUMBER = 16; // Partition spec used to create tables - protected static final PartitionSpec SPEC = + public static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).bucket("data", BUCKETS_NUMBER).build(); static final DataFile FILE_A = diff --git a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java new file mode 100644 index 000000000000..77f7808870cb --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import java.io.StringWriter; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.stream.Stream; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.JsonUtil; +import org.junit.Assert; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class TestContentFileParser { + private static Stream provideSpecAndDataFile() { + return Stream.of( + Arguments.of( + PartitionSpec.unpartitioned(), dataFileWithRequiredOnly(PartitionSpec.unpartitioned())), + Arguments.of( + PartitionSpec.unpartitioned(), dataFileWithAllOptional(PartitionSpec.unpartitioned())), + Arguments.of(TableTestBase.SPEC, dataFileWithRequiredOnly(TableTestBase.SPEC)), + Arguments.of(TableTestBase.SPEC, dataFileWithAllOptional(TableTestBase.SPEC))); + } + + private static DataFile dataFileWithRequiredOnly(PartitionSpec spec) { + DataFiles.Builder builder = + DataFiles.builder(spec) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withRecordCount(1); + + if (spec.isPartitioned()) { + // easy way to set partition data for now + builder.withPartitionPath("data_bucket=1"); + } + + return builder.build(); + } + + private static DataFile dataFileWithAllOptional(PartitionSpec spec) { + DataFiles.Builder builder = + DataFiles.builder(spec) + .withPath("/path/to/data-with-stats.parquet") + .withMetrics( + new Metrics( + 10L, // record count + ImmutableMap.of(3, 100L, 4, 200L), // column sizes + ImmutableMap.of(3, 90L, 4, 180L), // value counts + ImmutableMap.of(3, 10L, 4, 20L), // null value counts + ImmutableMap.of(3, 0L, 4, 0L), // nan value counts + ImmutableMap.of( + 3, + Conversions.toByteBuffer(Types.IntegerType.get(), 1), + 4, + Conversions.toByteBuffer(Types.IntegerType.get(), 2)), // lower bounds + ImmutableMap.of( + 3, + Conversions.toByteBuffer(Types.IntegerType.get(), 5), + 4, + Conversions.toByteBuffer(Types.IntegerType.get(), 10)) // upperbounds + )) + .withFileSizeInBytes(350) + .withSplitOffsets(Arrays.asList(128L, 256L)) + .withEqualityFieldIds(Arrays.asList(1)) + .withEncryptionKeyMetadata(ByteBuffer.wrap(new byte[16])) + .withSortOrder( + SortOrder.builderFor(TableTestBase.SCHEMA) + .withOrderId(1) + .sortBy("id", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()); + + if (spec.isPartitioned()) { + // easy way to set partition data for now + builder.withPartitionPath("data_bucket=1"); + } + + return builder.build(); + } + + private static Stream provideSpecAndDeleteFile() { + return Stream.of( + Arguments.of( + PartitionSpec.unpartitioned(), + deleteFileWithRequiredOnly(PartitionSpec.unpartitioned())), + Arguments.of( + PartitionSpec.unpartitioned(), + deleteFileWithAllOptional(PartitionSpec.unpartitioned())), + Arguments.of(TableTestBase.SPEC, deleteFileWithRequiredOnly(TableTestBase.SPEC)), + Arguments.of(TableTestBase.SPEC, deleteFileWithAllOptional(TableTestBase.SPEC))); + } + + private static DeleteFile deleteFileWithRequiredOnly(PartitionSpec spec) { + PartitionData partitionData = null; + if (spec.isPartitioned()) { + partitionData = new PartitionData(spec.partitionType()); + partitionData.set(0, 9); + } + + return new GenericDeleteFile( + 
spec.specId(), + FileContent.POSITION_DELETES, + "/path/to/delete-a.parquet", + FileFormat.PARQUET, + partitionData, + 1234, + new Metrics(9L, null, null, null, null), + null, + null, + null); + } + + private static DeleteFile deleteFileWithAllOptional(PartitionSpec spec) { + PartitionData partitionData = new PartitionData(spec.partitionType()); + if (spec.isPartitioned()) { + partitionData.set(0, 9); + } + + Metrics metrics = + new Metrics( + 10L, // record count + ImmutableMap.of(3, 100L, 4, 200L), // column sizes + ImmutableMap.of(3, 90L, 4, 180L), // value counts + ImmutableMap.of(3, 10L, 4, 20L), // null value counts + ImmutableMap.of(3, 0L, 4, 0L), // nan value counts + ImmutableMap.of( + 3, + Conversions.toByteBuffer(Types.IntegerType.get(), 1), + 4, + Conversions.toByteBuffer(Types.IntegerType.get(), 2)), // lower bounds + ImmutableMap.of( + 3, + Conversions.toByteBuffer(Types.IntegerType.get(), 5), + 4, + Conversions.toByteBuffer(Types.IntegerType.get(), 10)) // upperbounds + ); + + return new GenericDeleteFile( + spec.specId(), + FileContent.EQUALITY_DELETES, + "/path/to/delete-with-stats.parquet", + FileFormat.PARQUET, + partitionData, + 1234, + metrics, + new int[] {3}, + 1, + ByteBuffer.wrap(new byte[16])); + } + + @ParameterizedTest + @MethodSource("provideSpecAndDataFile") + public void testDataFile(PartitionSpec spec, DataFile dataFile) throws Exception { + ContentFileParser parser = new ContentFileParser(spec); + StringWriter stringWriter = new StringWriter(); + JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter); + + parser.toJson(dataFile, jsonStringGenerator); + jsonStringGenerator.close(); + String jsonStr = stringWriter.toString(); + + JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr); + DataFile deserializedFile = (DataFile) parser.fromJson(jsonNode); + assertContentFileEquals(dataFile, deserializedFile, spec); + } + + @ParameterizedTest + @MethodSource("provideSpecAndDeleteFile") + public void testDeleteFile(PartitionSpec spec, DeleteFile deleteFile) throws Exception { + ContentFileParser parser = new ContentFileParser(spec); + StringWriter stringWriter = new StringWriter(); + JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter); + + parser.toJson(deleteFile, jsonStringGenerator); + jsonStringGenerator.close(); + String jsonStr = stringWriter.toString(); + + JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr); + DeleteFile deserializedFile = (DeleteFile) parser.fromJson(jsonNode); + assertContentFileEquals(deleteFile, deserializedFile, spec); + } + + static void assertContentFileEquals( + ContentFile expected, ContentFile actual, PartitionSpec spec) { + Assert.assertEquals(expected.getClass(), actual.getClass()); + Assert.assertEquals(expected.specId(), actual.specId()); + Assert.assertEquals(expected.content(), actual.content()); + Assert.assertEquals(expected.path(), actual.path()); + Assert.assertEquals(expected.format(), actual.format()); + Assert.assertEquals( + 0, + Comparators.forType(spec.partitionType()) + .compare(expected.partition(), actual.partition())); + Assert.assertEquals(expected.recordCount(), actual.recordCount()); + Assert.assertEquals(expected.fileSizeInBytes(), actual.fileSizeInBytes()); + Assert.assertEquals(expected.columnSizes(), actual.columnSizes()); + Assert.assertEquals(expected.valueCounts(), actual.valueCounts()); + Assert.assertEquals(expected.nullValueCounts(), actual.nullValueCounts()); + Assert.assertEquals(expected.nanValueCounts(), 
actual.nanValueCounts()); + Assert.assertEquals(expected.lowerBounds(), actual.lowerBounds()); + Assert.assertEquals(expected.upperBounds(), actual.upperBounds()); + Assert.assertEquals(expected.keyMetadata(), actual.keyMetadata()); + Assert.assertEquals(expected.splitOffsets(), actual.splitOffsets()); + Assert.assertEquals(expected.equalityFieldIds(), actual.equalityFieldIds()); + Assert.assertEquals(expected.sortOrderId(), actual.sortOrderId()); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java new file mode 100644 index 000000000000..7708d14e6fd6 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import com.fasterxml.jackson.core.JsonGenerator; +import java.io.StringWriter; +import java.util.stream.Stream; +import org.apache.iceberg.expressions.ExpressionUtil; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.ResidualEvaluator; +import org.apache.iceberg.util.JsonUtil; +import org.junit.Assert; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class TestFileScanTaskParser { + private static Stream provideArgs() { + return Stream.of(Arguments.of(true), Arguments.of(false)); + } + + @ParameterizedTest + @MethodSource("provideArgs") + public void testParser(boolean caseSensitive) throws Exception { + FileScanTaskParser parser = new FileScanTaskParser(caseSensitive); + StringWriter stringWriter = new StringWriter(); + JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter); + PartitionSpec spec = TableTestBase.SPEC; + FileScanTask fileScanTask = createScanTask(spec, caseSensitive); + + parser.toJson(fileScanTask, jsonStringGenerator); + jsonStringGenerator.close(); + String jsonStr = stringWriter.toString(); + + FileScanTask deserializedTask = parser.fromJson(jsonStr); + assertFileScanTaskEquals(fileScanTask, deserializedTask, spec, caseSensitive); + } + + private FileScanTask createScanTask(PartitionSpec spec, boolean caseSensitive) { + ResidualEvaluator residualEvaluator; + if (spec.isUnpartitioned()) { + residualEvaluator = ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()); + } else { + residualEvaluator = ResidualEvaluator.of(spec, Expressions.equal("id", 1), caseSensitive); + } + + return new BaseFileScanTask( + TableTestBase.FILE_A, + new DeleteFile[] {TableTestBase.FILE_A_DELETES, TableTestBase.FILE_A2_DELETES}, + SchemaParser.toJson(TableTestBase.SCHEMA), + PartitionSpecParser.toJson(spec), + 
residualEvaluator); + } + + private static void assertFileScanTaskEquals( + FileScanTask expected, FileScanTask actual, PartitionSpec spec, boolean caseSensitive) { + TestContentFileParser.assertContentFileEquals(expected.file(), actual.file(), spec); + Assert.assertEquals(expected.deletes().size(), actual.deletes().size()); + for (int pos = 0; pos < expected.deletes().size(); ++pos) { + TestContentFileParser.assertContentFileEquals( + expected.deletes().get(pos), actual.deletes().get(pos), spec); + } + + Assert.assertTrue("Schema should be the same", expected.schema().sameSchema(actual.schema())); + Assert.assertEquals(expected.spec(), actual.spec()); + Assert.assertTrue( + ExpressionUtil.equivalent( + expected.residual(), + actual.residual(), + TableTestBase.SCHEMA.asStruct(), + caseSensitive)); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestManifestWriterVersions.java b/core/src/test/java/org/apache/iceberg/TestManifestWriterVersions.java index 740791b255d5..08b27d7460da 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestWriterVersions.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestWriterVersions.java @@ -78,7 +78,7 @@ public class TestManifestWriterVersions { private static final DataFile DATA_FILE = new GenericDataFile( - 0, PATH, FORMAT, PARTITION, 150972L, METRICS, null, OFFSETS, SORT_ORDER_ID); + 0, PATH, FORMAT, PARTITION, 150972L, METRICS, null, OFFSETS, null, SORT_ORDER_ID); private static final List EQUALITY_IDS = ImmutableList.of(1); private static final int[] EQUALITY_ID_ARR = new int[] {1}; From e3dc0b88583e36b73c5ba40654e341d8f70e68ad Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Fri, 24 Feb 2023 11:22:22 -0800 Subject: [PATCH 02/12] Update spec doc with JSON parser for content file and file scan task. --- format/spec.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/format/spec.md b/format/spec.md index 58cfc2291114..60c0f99c3f90 100644 --- a/format/spec.md +++ b/format/spec.md @@ -1128,6 +1128,41 @@ Example ] } ] ``` +### Content File (Data and Delete) Serialization + +Content file (data or delete) is serialized as a JSON object according to the following table. 
+
+| Metadata field | JSON representation | Example |
+|--------------------------|--- |--- |
+| **`spec-id`** |`JSON int`|`1`|
+| **`content`** |`JSON string`|`DATA`, `POSITION_DELETES`, `EQUALITY_DELETES`|
+| **`file-path`** |`JSON string`|`"s3://b/wh/data.db/table"`|
+| **`file-format`** |`JSON string`|`AVRO`, `ORC`, `PARQUET`|
+| **`partition`** |`JSON object: Partition data tuple using partition field ids for the struct field ids`|`{"1000":1}`|
+| **`record-count`** |`JSON long`|`1`|
+| **`file-size-in-bytes`** |`JSON long`|`1024`|
+| **`column-sizes`** |`JSON object: Map from column id to the total size on disk of all regions that store the column`|`{"keys":[3,4],"values":[100,200]}`|
+| **`value-counts`** |`JSON object: Map from column id to number of values in the column (including null and NaN values)`|`{"keys":[3,4],"values":[90,180]}`|
+| **`null-value-counts`** |`JSON object: Map from column id to number of null values in the column`|`{"keys":[3,4],"values":[10,20]}`|
+| **`nan-value-counts`** |`JSON object: Map from column id to number of NaN values in the column`|`{"keys":[3,4],"values":[0,0]}`|
+| **`lower-bounds`** |`JSON object: Map from column id to lower bound binary in the column serialized as hexadecimal string`|`{"keys":[3,4],"values":["01000000","02000000"]}`|
+| **`upper-bounds`** |`JSON object: Map from column id to upper bound binary in the column serialized as hexadecimal string`|`{"keys":[3,4],"values":["05000000","0A000000"]}`|
+| **`key-metadata`** |`JSON string: Encryption key metadata binary serialized as hexadecimal string`|`00000000000000000000000000000000`|
+| **`split-offsets`** |`JSON list of long: Split offsets for the data file`|`[128,256]`|
+| **`equality-ids`** |`JSON list of int: Field ids used to determine row equality in equality delete files`|`[1]`|
+| **`sort-order-id`** |`JSON int`|`1`|
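+
+For illustration only (not normative), a data file serialized with the example values
+from the table above would look like this:
+
+```
+{
+  "spec-id": 1,
+  "content": "DATA",
+  "file-path": "s3://b/wh/data.db/table",
+  "file-format": "PARQUET",
+  "partition": {"1000": 1},
+  "record-count": 1,
+  "file-size-in-bytes": 1024,
+  "column-sizes": {"keys": [3, 4], "values": [100, 200]},
+  "value-counts": {"keys": [3, 4], "values": [90, 180]},
+  "null-value-counts": {"keys": [3, 4], "values": [10, 20]},
+  "nan-value-counts": {"keys": [3, 4], "values": [0, 0]},
+  "lower-bounds": {"keys": [3, 4], "values": ["01000000", "02000000"]},
+  "upper-bounds": {"keys": [3, 4], "values": ["05000000", "0A000000"]},
+  "key-metadata": "00000000000000000000000000000000",
+  "split-offsets": [128, 256],
+  "sort-order-id": 1
+}
+```
+
+### File Scan Task Serialization
+
+A file scan task is serialized as a JSON object with the fields described in the table
+below. For illustration only, a task with one data file, one position delete file, and a
+residual filter could look like the following; the `schema`, `spec`, and content file
+values follow the JSON formats described above, and all concrete values are hypothetical
+(modeled loosely on this PR's test fixtures):
+
+```
+{
+  "schema": {
+    "type": "struct",
+    "schema-id": 0,
+    "fields": [
+      {"id": 3, "name": "id", "required": true, "type": "int"},
+      {"id": 4, "name": "data", "required": true, "type": "string"}
+    ]
+  },
+  "spec": {
+    "spec-id": 0,
+    "fields": [
+      {"name": "data_bucket", "transform": "bucket[16]", "source-id": 4, "field-id": 1000}
+    ]
+  },
+  "data-file": {
+    "spec-id": 0,
+    "content": "DATA",
+    "file-path": "/path/to/data-a.parquet",
+    "file-format": "PARQUET",
+    "partition": {"1000": 1},
+    "record-count": 1,
+    "file-size-in-bytes": 10
+  },
+  "delete-files": [
+    {
+      "spec-id": 0,
+      "content": "POSITION_DELETES",
+      "file-path": "/path/to/delete-a.parquet",
+      "file-format": "PARQUET",
+      "partition": {"1000": 9},
+      "record-count": 9,
+      "file-size-in-bytes": 1234
+    }
+  ],
+  "residual-filter": {"type": "eq", "term": "id", "value": 1}
+}
+```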
+ +| Metadata field |JSON representation|Example| +|--------------------------|--- |--- | +| **`schema`** |`JSON object`|`See above, read schemas instead`| +| **`spec`** |`JSON object`|`See above, read partition specs instead`| +| **`data-file`** |`JSON object`|`See above, read content file instead`| +| **`delete-files`** |`JSON list of objects`|`See above, read content file instead`| +| **`residual-filter`** |`JSON object: residual filter expression`|`{"type":"eq","term":"id","value":1}`| ## Appendix D: Single-value serialization From 31c5c8ae0566c7fd767fc2d2551e4e4987cb44bf Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 20 Mar 2023 14:49:51 -0700 Subject: [PATCH 03/12] Address Eduard's initial round of comments --- .../org/apache/iceberg/ContentFileParser.java | 56 +++++----------- .../apache/iceberg/FileScanTaskParser.java | 44 ++++++------- .../org/apache/iceberg/util/JsonUtil.java | 27 ++++++++ .../apache/iceberg/TestContentFileParser.java | 32 ++++++--- .../iceberg/TestFileScanTaskParser.java | 29 ++++---- .../org/apache/iceberg/util/TestJsonUtil.java | 66 +++++++++++++++++++ 6 files changed, 168 insertions(+), 86 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/ContentFileParser.java b/core/src/main/java/org/apache/iceberg/ContentFileParser.java index c777fc798ed6..86ec802ad28e 100644 --- a/core/src/main/java/org/apache/iceberg/ContentFileParser.java +++ b/core/src/main/java/org/apache/iceberg/ContentFileParser.java @@ -25,7 +25,6 @@ import java.util.List; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.ArrayUtil; import org.apache.iceberg.util.JsonUtil; class ContentFileParser { @@ -47,17 +46,16 @@ class ContentFileParser { private static final String EQUALITY_IDS = "equality-ids"; private static final String SORT_ORDER_ID = "sort-order-id"; - private final PartitionSpec spec; + private ContentFileParser() {} - ContentFileParser(PartitionSpec spec) { - this.spec = spec; - } - - private boolean hasPartitionData(StructLike partitionData) { + private static boolean hasPartitionData(StructLike partitionData) { return partitionData != null && partitionData.size() > 0; } - void toJson(ContentFile contentFile, JsonGenerator generator) throws IOException { + static void toJson(ContentFile contentFile, PartitionSpec spec, JsonGenerator generator) + throws IOException { + Preconditions.checkNotNull(contentFile, "Content file cannot be null"); + generator.writeStartObject(); // ignore the ordinal position (ContentFile#pos) of the file in a manifest, @@ -143,14 +141,15 @@ void toJson(ContentFile contentFile, JsonGenerator generator) throws IOException generator.writeEndObject(); } - ContentFile fromJson(JsonNode jsonNode) { + static ContentFile fromJson(JsonNode jsonNode, PartitionSpec spec) { + Preconditions.checkNotNull(jsonNode, "Cannot parse content file from null JSON node"); Preconditions.checkArgument( jsonNode.isObject(), "Cannot parse content file from a non-object: %s", jsonNode); int specId = JsonUtil.getInt(SPEC_ID, jsonNode); FileContent fileContent = FileContent.valueOf(JsonUtil.getString(CONTENT, jsonNode)); String filePath = JsonUtil.getString(FILE_PATH, jsonNode); - FileFormat fileFormat = FileFormat.valueOf(JsonUtil.getString(FILE_FORMAT, jsonNode)); + FileFormat fileFormat = FileFormat.fromString(JsonUtil.getString(FILE_FORMAT, jsonNode)); PartitionData partitionData = null; if (jsonNode.has(PARTITION)) { @@ -169,36 +168,11 @@ ContentFile fromJson(JsonNode jsonNode) { } long 
fileSizeInBytes = JsonUtil.getLong(FILE_SIZE, jsonNode); - Metrics metrics = toMetrics(jsonNode); - - ByteBuffer keyMetadata = null; - if (jsonNode.has(KEY_METADATA)) { - keyMetadata = - (ByteBuffer) - SingleValueParser.fromJson(DataFile.KEY_METADATA.type(), jsonNode.get(KEY_METADATA)); - } - - List splitOffsets = null; - if (jsonNode.has(SPLIT_OFFSETS)) { - splitOffsets = - (List) - SingleValueParser.fromJson( - DataFile.SPLIT_OFFSETS.type(), jsonNode.get(SPLIT_OFFSETS)); - } - - int[] equalityFieldIds = null; - if (jsonNode.has(EQUALITY_IDS)) { - equalityFieldIds = - ArrayUtil.toIntArray( - (List) - SingleValueParser.fromJson( - DataFile.EQUALITY_IDS.type(), jsonNode.get(EQUALITY_IDS))); - } - - Integer sortOrderId = null; - if (jsonNode.has(SORT_ORDER_ID)) { - sortOrderId = JsonUtil.getInt(SORT_ORDER_ID, jsonNode); - } + Metrics metrics = metricsFromJson(jsonNode); + ByteBuffer keyMetadata = JsonUtil.getByteBufferOrNull(KEY_METADATA, jsonNode); + List splitOffsets = JsonUtil.getLongListOrNull(SPLIT_OFFSETS, jsonNode); + int[] equalityFieldIds = JsonUtil.getIntArrayOrNull(EQUALITY_IDS, jsonNode); + Integer sortOrderId = JsonUtil.getIntOrNull(SORT_ORDER_ID, jsonNode); if (fileContent == FileContent.DATA) { return new GenericDataFile( @@ -227,7 +201,7 @@ ContentFile fromJson(JsonNode jsonNode) { } } - private Metrics toMetrics(JsonNode jsonNode) { + private static Metrics metricsFromJson(JsonNode jsonNode) { long recordCount = JsonUtil.getLong(RECORD_COUNT, jsonNode); Map columnSizes = null; diff --git a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java index 6399c455af2b..d24546e9b3bd 100644 --- a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java +++ b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java @@ -20,8 +20,6 @@ import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; -import com.github.benmanes.caffeine.cache.Caffeine; -import com.github.benmanes.caffeine.cache.LoadingCache; import java.io.IOException; import java.io.StringWriter; import java.io.UncheckedIOException; @@ -40,16 +38,10 @@ public class FileScanTaskParser { private static final String DELETE_FILES = "delete-files"; private static final String RESIDUAL = "residual-filter"; - private final boolean caseSensitive; - private final LoadingCache contentFileParsersBySpec; + private FileScanTaskParser() {} - public FileScanTaskParser(boolean caseSensitive) { - this.caseSensitive = caseSensitive; - this.contentFileParsersBySpec = - Caffeine.newBuilder().weakKeys().build(spec -> new ContentFileParser(spec)); - } - - public String toJson(FileScanTask fileScanTask) { + public static String toJson(FileScanTask fileScanTask) { + Preconditions.checkNotNull(fileScanTask, "File scan task cannot be null"); try (StringWriter writer = new StringWriter()) { JsonGenerator generator = JsonUtil.factory().createGenerator(writer); toJson(fileScanTask, generator); @@ -60,26 +52,26 @@ public String toJson(FileScanTask fileScanTask) { } } - void toJson(FileScanTask fileScanTask, JsonGenerator generator) throws IOException { + private static void toJson(FileScanTask fileScanTask, JsonGenerator generator) + throws IOException { generator.writeStartObject(); generator.writeFieldName(SCHEMA); SchemaParser.toJson(fileScanTask.schema(), generator); generator.writeFieldName(SPEC); - PartitionSpecParser.toJson(fileScanTask.spec(), generator); - - ContentFileParser contentFileParser = 
contentFileParsersBySpec.get(fileScanTask.spec()); + PartitionSpec spec = fileScanTask.spec(); + PartitionSpecParser.toJson(spec, generator); if (fileScanTask.file() != null) { generator.writeFieldName(DATA_FILE); - contentFileParser.toJson(fileScanTask.file(), generator); + ContentFileParser.toJson(fileScanTask.file(), spec, generator); } if (fileScanTask.deletes() != null) { generator.writeArrayFieldStart(DELETE_FILES); for (DeleteFile deleteFile : fileScanTask.deletes()) { - contentFileParser.toJson(deleteFile, generator); + ContentFileParser.toJson(deleteFile, spec, generator); } generator.writeEndArray(); } @@ -92,11 +84,17 @@ void toJson(FileScanTask fileScanTask, JsonGenerator generator) throws IOExcepti generator.writeEndObject(); } - public FileScanTask fromJson(String json) { - return JsonUtil.parse(json, this::fromJson); + public static FileScanTask fromJson(String json, boolean caseSensitive) { + Preconditions.checkNotNull(json, "Cannot parse file scan task from null JSON string"); + try { + JsonNode jsonNode = JsonUtil.mapper().readValue(json, JsonNode.class); + return fromJsonNode(jsonNode, caseSensitive); + } catch (IOException e) { + throw new UncheckedIOException(e); + } } - FileScanTask fromJson(JsonNode jsonNode) { + private static FileScanTask fromJsonNode(JsonNode jsonNode, boolean caseSensitive) { Preconditions.checkArgument( jsonNode.isObject(), "Cannot parse file scan task from a non-object: %s", jsonNode); @@ -108,11 +106,9 @@ FileScanTask fromJson(JsonNode jsonNode) { PartitionSpec spec = PartitionSpecParser.fromJson(schema, specNode); String specString = PartitionSpecParser.toJson(spec); - ContentFileParser contentFileParser = contentFileParsersBySpec.get(spec); - DataFile dataFile = null; if (jsonNode.has(DATA_FILE)) { - dataFile = (DataFile) contentFileParser.fromJson(jsonNode.get(DATA_FILE)); + dataFile = (DataFile) ContentFileParser.fromJson(jsonNode.get(DATA_FILE), spec); } DeleteFile[] deleteFiles = null; @@ -123,7 +119,7 @@ FileScanTask fromJson(JsonNode jsonNode) { // parse the schema array ImmutableList.Builder builder = ImmutableList.builder(); for (JsonNode deleteFileNode : deletesArray) { - DeleteFile deleteFile = (DeleteFile) contentFileParser.fromJson(deleteFileNode); + DeleteFile deleteFile = (DeleteFile) ContentFileParser.fromJson(deleteFileNode, spec); builder.add(deleteFile); } diff --git a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java index 84c0681164d6..896e777d9630 100644 --- a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java @@ -26,14 +26,17 @@ import java.io.IOException; import java.io.StringWriter; import java.io.UncheckedIOException; +import java.nio.ByteBuffer; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.iceberg.SingleValueParser; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; public class JsonUtil { @@ -173,6 +176,14 @@ public static String getStringOrNull(String property, JsonNode node) { return getString(property, node); } + public static ByteBuffer getByteBufferOrNull(String property, JsonNode node) { + if (!node.has(property) || 
node.get(property).isNull()) { + return null; + } + + return (ByteBuffer) SingleValueParser.fromJson(Types.BinaryType.get(), node.get(property)); + } + public static Map getStringMap(String property, JsonNode node) { Preconditions.checkArgument(node.has(property), "Cannot parse missing map: %s", property); JsonNode pNode = node.get(property); @@ -229,6 +240,14 @@ public static List getStringListOrNull(String property, JsonNode node) { .build(); } + public static int[] getIntArrayOrNull(String property, JsonNode node) { + if (!node.has(property) || node.get(property).isNull()) { + return null; + } + + return ArrayUtil.toIntArray(getIntegerList(property, node)); + } + public static List getIntegerList(String property, JsonNode node) { Preconditions.checkArgument(node.has(property), "Cannot parse missing list: %s", property); return ImmutableList.builder() @@ -256,6 +275,14 @@ public static List getLongList(String property, JsonNode node) { return ImmutableList.builder().addAll(new JsonLongArrayIterator(property, node)).build(); } + public static List getLongListOrNull(String property, JsonNode node) { + if (!node.has(property) || node.get(property).isNull()) { + return null; + } + + return ImmutableList.builder().addAll(new JsonLongArrayIterator(property, node)).build(); + } + public static Set getLongSetOrNull(String property, JsonNode node) { if (!node.hasNonNull(property)) { return null; diff --git a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java index 77f7808870cb..12ffc0af4eab 100644 --- a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java +++ b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java @@ -23,18 +23,36 @@ import java.io.StringWriter; import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Collections; import java.util.stream.Stream; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.JsonUtil; +import org.assertj.core.api.Assertions; import org.junit.Assert; +import org.junit.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; public class TestContentFileParser { + @Test + public void testNullArguments() throws Exception { + StringWriter stringWriter = new StringWriter(); + JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter); + + Assertions.assertThatThrownBy( + () -> ContentFileParser.toJson(null, TableTestBase.SPEC, jsonStringGenerator)) + .isInstanceOf(NullPointerException.class) + .hasMessage("Content file cannot be null"); + + Assertions.assertThatThrownBy(() -> ContentFileParser.fromJson(null, TableTestBase.SPEC)) + .isInstanceOf(NullPointerException.class) + .hasMessage("Cannot parse content file from null JSON node"); + } + private static Stream provideSpecAndDataFile() { return Stream.of( Arguments.of( @@ -84,7 +102,7 @@ private static DataFile dataFileWithAllOptional(PartitionSpec spec) { )) .withFileSizeInBytes(350) .withSplitOffsets(Arrays.asList(128L, 256L)) - .withEqualityFieldIds(Arrays.asList(1)) + .withEqualityFieldIds(Collections.singletonList(1)) .withEncryptionKeyMetadata(ByteBuffer.wrap(new byte[16])) .withSortOrder( SortOrder.builderFor(TableTestBase.SCHEMA) @@ -173,37 +191,35 @@ private static DeleteFile 
deleteFileWithAllOptional(PartitionSpec spec) { @ParameterizedTest @MethodSource("provideSpecAndDataFile") public void testDataFile(PartitionSpec spec, DataFile dataFile) throws Exception { - ContentFileParser parser = new ContentFileParser(spec); StringWriter stringWriter = new StringWriter(); JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter); - parser.toJson(dataFile, jsonStringGenerator); + ContentFileParser.toJson(dataFile, spec, jsonStringGenerator); jsonStringGenerator.close(); String jsonStr = stringWriter.toString(); JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr); - DataFile deserializedFile = (DataFile) parser.fromJson(jsonNode); + DataFile deserializedFile = (DataFile) ContentFileParser.fromJson(jsonNode, spec); assertContentFileEquals(dataFile, deserializedFile, spec); } @ParameterizedTest @MethodSource("provideSpecAndDeleteFile") public void testDeleteFile(PartitionSpec spec, DeleteFile deleteFile) throws Exception { - ContentFileParser parser = new ContentFileParser(spec); StringWriter stringWriter = new StringWriter(); JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter); - parser.toJson(deleteFile, jsonStringGenerator); + ContentFileParser.toJson(deleteFile, spec, jsonStringGenerator); jsonStringGenerator.close(); String jsonStr = stringWriter.toString(); JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr); - DeleteFile deserializedFile = (DeleteFile) parser.fromJson(jsonNode); + DeleteFile deserializedFile = (DeleteFile) ContentFileParser.fromJson(jsonNode, spec); assertContentFileEquals(deleteFile, deserializedFile, spec); } static void assertContentFileEquals( - ContentFile expected, ContentFile actual, PartitionSpec spec) { + ContentFile expected, ContentFile actual, PartitionSpec spec) { Assert.assertEquals(expected.getClass(), actual.getClass()); Assert.assertEquals(expected.specId(), actual.specId()); Assert.assertEquals(expected.content(), actual.content()); diff --git a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java index 7708d14e6fd6..5da0af624d68 100644 --- a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java +++ b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java @@ -18,37 +18,40 @@ */ package org.apache.iceberg; -import com.fasterxml.jackson.core.JsonGenerator; -import java.io.StringWriter; import java.util.stream.Stream; import org.apache.iceberg.expressions.ExpressionUtil; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.ResidualEvaluator; -import org.apache.iceberg.util.JsonUtil; +import org.assertj.core.api.Assertions; import org.junit.Assert; +import org.junit.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; public class TestFileScanTaskParser { + @Test + public void testNullArguments() { + Assertions.assertThatThrownBy(() -> FileScanTaskParser.toJson(null)) + .isInstanceOf(NullPointerException.class) + .hasMessage("File scan task cannot be null"); + + Assertions.assertThatThrownBy(() -> FileScanTaskParser.fromJson(null, true)) + .isInstanceOf(NullPointerException.class) + .hasMessage("Cannot parse file scan task from null JSON string"); + } + private static Stream provideArgs() { return Stream.of(Arguments.of(true), Arguments.of(false)); } @ParameterizedTest @MethodSource("provideArgs") - public void 
testParser(boolean caseSensitive) throws Exception { - FileScanTaskParser parser = new FileScanTaskParser(caseSensitive); - StringWriter stringWriter = new StringWriter(); - JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter); + public void testParser(boolean caseSensitive) { PartitionSpec spec = TableTestBase.SPEC; FileScanTask fileScanTask = createScanTask(spec, caseSensitive); - - parser.toJson(fileScanTask, jsonStringGenerator); - jsonStringGenerator.close(); - String jsonStr = stringWriter.toString(); - - FileScanTask deserializedTask = parser.fromJson(jsonStr); + String jsonStr = FileScanTaskParser.toJson(fileScanTask); + FileScanTask deserializedTask = FileScanTaskParser.fromJson(jsonStr, caseSensitive); assertFileScanTaskEquals(fileScanTask, deserializedTask, spec, caseSensitive); } diff --git a/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java b/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java index 8a769fccb9e2..c8ba7b61efb5 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java @@ -19,10 +19,12 @@ package org.apache.iceberg.util; import com.fasterxml.jackson.core.JsonProcessingException; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.io.BaseEncoding; import org.assertj.core.api.Assertions; import org.junit.Test; @@ -167,6 +169,26 @@ public void getStringOrNull() throws JsonProcessingException { .hasMessage("Cannot parse to a string value: x: 23"); } + @Test + public void getByteBufferOrNull() throws JsonProcessingException { + Assertions.assertThat(JsonUtil.getByteBufferOrNull("x", JsonUtil.mapper().readTree("{}"))) + .isNull(); + Assertions.assertThat( + JsonUtil.getByteBufferOrNull("x", JsonUtil.mapper().readTree("{\"x\": null}"))) + .isNull(); + + byte[] bytes = new byte[] {1, 2, 3, 4}; + String base16Str = BaseEncoding.base16().encode(bytes); + String json = String.format("{\"x\": \"%s\"}", base16Str); + ByteBuffer byteBuffer = JsonUtil.getByteBufferOrNull("x", JsonUtil.mapper().readTree(json)); + Assertions.assertThat(byteBuffer.array()).isEqualTo(bytes); + + Assertions.assertThatThrownBy( + () -> JsonUtil.getByteBufferOrNull("x", JsonUtil.mapper().readTree("{\"x\": 23}"))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot parse default as a binary value: 23"); + } + @Test public void getBool() throws JsonProcessingException { Assertions.assertThatThrownBy(() -> JsonUtil.getBool("x", JsonUtil.mapper().readTree("{}"))) @@ -194,6 +216,28 @@ public void getBool() throws JsonProcessingException { .isFalse(); } + @Test + public void getIntArrayOrNull() throws JsonProcessingException { + Assertions.assertThat(JsonUtil.getIntArrayOrNull("items", JsonUtil.mapper().readTree("{}"))) + .isNull(); + + Assertions.assertThat( + JsonUtil.getIntArrayOrNull("items", JsonUtil.mapper().readTree("{\"items\": null}"))) + .isNull(); + + Assertions.assertThatThrownBy( + () -> + JsonUtil.getIntArrayOrNull( + "items", JsonUtil.mapper().readTree("{\"items\": [13, \"23\"]}"))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot parse integer from non-int value in items: \"23\""); + + Assertions.assertThat( + JsonUtil.getIntArrayOrNull( + "items", JsonUtil.mapper().readTree("{\"items\": [23, 45]}"))) + .isEqualTo(new int[] {23, 45}); + } + @Test 
  public void getIntegerList() throws JsonProcessingException {
     Assertions.assertThatThrownBy(
@@ -312,6 +356,28 @@ public void getLongList() throws JsonProcessingException {
         .isEqualTo(items);
   }
 
+  @Test
+  public void getLongListOrNull() throws JsonProcessingException {
+    Assertions.assertThat(JsonUtil.getLongListOrNull("items", JsonUtil.mapper().readTree("{}")))
+        .isNull();
+
+    Assertions.assertThat(
+            JsonUtil.getLongListOrNull("items", JsonUtil.mapper().readTree("{\"items\": null}")))
+        .isNull();
+
+    Assertions.assertThatThrownBy(
+            () ->
+                JsonUtil.getLongListOrNull(
+                    "items", JsonUtil.mapper().readTree("{\"items\": [13, \"23\"]}")))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Cannot parse long from non-long value in items: \"23\"");
+
+    Assertions.assertThat(
+            JsonUtil.getLongListOrNull(
+                "items", JsonUtil.mapper().readTree("{\"items\": [23, 45]}")))
+        .containsExactlyElementsOf(Arrays.asList(23L, 45L));
+  }
+
   @Test
   public void getLongSet() throws JsonProcessingException {
     Assertions.assertThatThrownBy(

From ffe477077943b767abb39c7426c0d7234bd61c85 Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Tue, 21 Mar 2023 09:11:34 -0700
Subject: [PATCH 04/12] switch to Preconditions.checkArgument

---
 .../src/main/java/org/apache/iceberg/ContentFileParser.java | 6 ++++--
 .../main/java/org/apache/iceberg/FileScanTaskParser.java    | 4 ++--
 .../test/java/org/apache/iceberg/TestContentFileParser.java | 4 ++--
 .../java/org/apache/iceberg/TestFileScanTaskParser.java     | 4 ++--
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/ContentFileParser.java b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
index 86ec802ad28e..638cd2fa7e44 100644
--- a/core/src/main/java/org/apache/iceberg/ContentFileParser.java
+++ b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
@@ -54,7 +54,9 @@ private static boolean hasPartitionData(StructLike partitionData) {
 
   static void toJson(ContentFile<?> contentFile, PartitionSpec spec, JsonGenerator generator)
       throws IOException {
-    Preconditions.checkNotNull(contentFile, "Content file cannot be null");
+    Preconditions.checkArgument(contentFile != null, "Content file cannot be null");
+    Preconditions.checkArgument(spec != null, "Partition spec cannot be null");
+    Preconditions.checkArgument(generator != null, "JSON generator cannot be null");
 
     generator.writeStartObject();
 
@@ -142,7 +144,7 @@ static void toJson(ContentFile<?> contentFile, PartitionSpec spec, JsonGenerator
   }
 
   static ContentFile<?> fromJson(JsonNode jsonNode, PartitionSpec spec) {
-    Preconditions.checkNotNull(jsonNode, "Cannot parse content file from null JSON node");
+    Preconditions.checkArgument(jsonNode != null, "Cannot parse content file from null JSON node");
     Preconditions.checkArgument(
         jsonNode.isObject(), "Cannot parse content file from a non-object: %s", jsonNode);
 
diff --git a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
index d24546e9b3bd..42a1d15caa92 100644
--- a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
+++ b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
@@ -41,7 +41,7 @@ public class FileScanTaskParser {
   private FileScanTaskParser() {}
 
   public static String toJson(FileScanTask fileScanTask) {
-    Preconditions.checkNotNull(fileScanTask, "File scan task cannot be null");
+    Preconditions.checkArgument(fileScanTask != null, "File scan task cannot be null");
     try (StringWriter writer = new StringWriter()) {
      JsonGenerator generator = JsonUtil.factory().createGenerator(writer);
       toJson(fileScanTask, generator);
@@ -85,7 +85,7 @@ private static void toJson(FileScanTask fileScanTask, JsonGenerator generator)
   }
 
   public static FileScanTask fromJson(String json, boolean caseSensitive) {
-    Preconditions.checkNotNull(json, "Cannot parse file scan task from null JSON string");
+    Preconditions.checkArgument(json != null, "Cannot parse file scan task from null JSON string");
     try {
       JsonNode jsonNode = JsonUtil.mapper().readValue(json, JsonNode.class);
       return fromJsonNode(jsonNode, caseSensitive);
diff --git a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
index 12ffc0af4eab..4238757f62cf 100644
--- a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
+++ b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
@@ -45,11 +45,11 @@ public void testNullArguments() throws Exception {
 
     Assertions.assertThatThrownBy(
             () -> ContentFileParser.toJson(null, TableTestBase.SPEC, jsonStringGenerator))
-        .isInstanceOf(NullPointerException.class)
+        .isInstanceOf(IllegalArgumentException.class)
         .hasMessage("Content file cannot be null");
 
     Assertions.assertThatThrownBy(() -> ContentFileParser.fromJson(null, TableTestBase.SPEC))
-        .isInstanceOf(NullPointerException.class)
+        .isInstanceOf(IllegalArgumentException.class)
         .hasMessage("Cannot parse content file from null JSON node");
   }
 
diff --git a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
index 5da0af624d68..c65e0983de43 100644
--- a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
+++ b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
@@ -33,11 +33,11 @@ public class TestFileScanTaskParser {
   @Test
   public void testNullArguments() {
     Assertions.assertThatThrownBy(() -> FileScanTaskParser.toJson(null))
-        .isInstanceOf(NullPointerException.class)
+        .isInstanceOf(IllegalArgumentException.class)
        .hasMessage("File scan task cannot be null");
 
     Assertions.assertThatThrownBy(() -> FileScanTaskParser.fromJson(null, true))
-        .isInstanceOf(NullPointerException.class)
+        .isInstanceOf(IllegalArgumentException.class)
         .hasMessage("Cannot parse file scan task from null JSON string");
   }
 
From c1d913dbf161ed8fd48fb0d0607030f440b779a9 Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Tue, 4 Apr 2023 19:34:17 -0700
Subject: [PATCH 05/12] address Eduard's second round of comments

---
 .../org/apache/iceberg/ContentFileParser.java |  6 ++
 .../apache/iceberg/FileScanTaskParser.java    | 13 +---
 .../apache/iceberg/TestContentFileParser.java | 73 +++++++++----------
 .../iceberg/TestFileScanTaskParser.java       | 14 +---
 4 files changed, 49 insertions(+), 57 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/ContentFileParser.java b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
index 638cd2fa7e44..6c05ca5cc4a5 100644
--- a/core/src/main/java/org/apache/iceberg/ContentFileParser.java
+++ b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
@@ -52,6 +52,11 @@ private static boolean hasPartitionData(StructLike partitionData) {
     return partitionData != null && partitionData.size() > 0;
   }
 
+  static String toJson(ContentFile<?> contentFile, PartitionSpec spec) {
+    return JsonUtil.generate(
+        generator -> ContentFileParser.toJson(contentFile, spec, generator), false);
+  }
+
   static void toJson(ContentFile<?> contentFile, PartitionSpec spec, JsonGenerator generator)
      throws IOException {
     Preconditions.checkArgument(contentFile != null, "Content file cannot be null");
@@ -147,6 +152,7 @@ static ContentFile<?> fromJson(JsonNode jsonNode, PartitionSpec spec) {
     Preconditions.checkArgument(jsonNode != null, "Cannot parse content file from null JSON node");
     Preconditions.checkArgument(
         jsonNode.isObject(), "Cannot parse content file from a non-object: %s", jsonNode);
+    Preconditions.checkArgument(spec != null, "Partition spec cannot be null");
 
     int specId = JsonUtil.getInt(SPEC_ID, jsonNode);
     FileContent fileContent = FileContent.valueOf(JsonUtil.getString(CONTENT, jsonNode));
diff --git a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
index 42a1d15caa92..dce5f78cd95b 100644
--- a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
+++ b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
@@ -21,7 +21,6 @@
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.JsonNode;
 import java.io.IOException;
-import java.io.StringWriter;
 import java.io.UncheckedIOException;
 import org.apache.iceberg.expressions.Expression;
 import org.apache.iceberg.expressions.ExpressionParser;
@@ -41,19 +40,13 @@ public class FileScanTaskParser {
   private FileScanTaskParser() {}
 
   public static String toJson(FileScanTask fileScanTask) {
-    Preconditions.checkArgument(fileScanTask != null, "File scan task cannot be null");
-    try (StringWriter writer = new StringWriter()) {
-      JsonGenerator generator = JsonUtil.factory().createGenerator(writer);
-      toJson(fileScanTask, generator);
-      generator.flush();
-      return writer.toString();
-    } catch (IOException e) {
-      throw new UncheckedIOException("Failed to write json for: " + fileScanTask, e);
-    }
+    return JsonUtil.generate(
+        generator -> FileScanTaskParser.toJson(fileScanTask, generator), false);
   }
 
   private static void toJson(FileScanTask fileScanTask, JsonGenerator generator)
       throws IOException {
+    Preconditions.checkArgument(fileScanTask != null, "File scan task cannot be null");
     generator.writeStartObject();
 
     generator.writeFieldName(SCHEMA);
diff --git a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
index 4238757f62cf..8f32b4904d15 100644
--- a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
+++ b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
@@ -18,9 +18,7 @@
  */
 package org.apache.iceberg;
 
-import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.JsonNode;
-import java.io.StringWriter;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Collections;
@@ -32,7 +30,7 @@
 import org.apache.iceberg.util.JsonUtil;
 import org.assertj.core.api.Assertions;
 import org.junit.Assert;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;
@@ -40,17 +38,48 @@ public class TestContentFileParser {
   @Test
   public void testNullArguments() throws Exception {
-    StringWriter stringWriter = new StringWriter();
-    JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter);
+    Assertions.assertThatThrownBy(() -> ContentFileParser.toJson(null, TableTestBase.SPEC))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Content file cannot be null");
+
+    Assertions.assertThatThrownBy(() -> ContentFileParser.toJson(TableTestBase.FILE_A, null))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Partition spec cannot be null");
 
     Assertions.assertThatThrownBy(
-            () -> ContentFileParser.toJson(null, TableTestBase.SPEC, jsonStringGenerator))
+            () -> ContentFileParser.toJson(TableTestBase.FILE_A, TableTestBase.SPEC, null))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Content file cannot be null");
+        .hasMessage("JSON generator cannot be null");
 
     Assertions.assertThatThrownBy(() -> ContentFileParser.fromJson(null, TableTestBase.SPEC))
         .isInstanceOf(IllegalArgumentException.class)
         .hasMessage("Cannot parse content file from null JSON node");
+
+    String jsonStr = ContentFileParser.toJson(TableTestBase.FILE_A, TableTestBase.SPEC);
+    JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr);
+    Assertions.assertThatThrownBy(() -> ContentFileParser.fromJson(jsonNode, null))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Partition spec cannot be null");
+  }
+
+  @ParameterizedTest
+  @MethodSource("provideSpecAndDataFile")
+  public void testDataFile(PartitionSpec spec, DataFile dataFile) throws Exception {
+    String jsonStr = ContentFileParser.toJson(dataFile, spec);
+    JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr);
+    ContentFile<?> deserializedContentFile = ContentFileParser.fromJson(jsonNode, spec);
+    Assertions.assertThat(deserializedContentFile).isInstanceOf(DataFile.class);
+    assertContentFileEquals(dataFile, deserializedContentFile, spec);
+  }
+
+  @ParameterizedTest
+  @MethodSource("provideSpecAndDeleteFile")
+  public void testDeleteFile(PartitionSpec spec, DeleteFile deleteFile) throws Exception {
+    String jsonStr = ContentFileParser.toJson(deleteFile, spec);
+    JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr);
+    ContentFile<?> deserializedContentFile = ContentFileParser.fromJson(jsonNode, spec);
+    Assertions.assertThat(deserializedContentFile).isInstanceOf(DeleteFile.class);
+    assertContentFileEquals(deleteFile, deserializedContentFile, spec);
   }
 
   private static Stream<Arguments> provideSpecAndDataFile() {
@@ -188,36 +217,6 @@ private static DeleteFile deleteFileWithAllOptional(PartitionSpec spec) {
         ByteBuffer.wrap(new byte[16]));
   }
 
-  @ParameterizedTest
-  @MethodSource("provideSpecAndDataFile")
-  public void testDataFile(PartitionSpec spec, DataFile dataFile) throws Exception {
-    StringWriter stringWriter = new StringWriter();
-    JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter);
-
-    ContentFileParser.toJson(dataFile, spec, jsonStringGenerator);
-    jsonStringGenerator.close();
-    String jsonStr = stringWriter.toString();
-
-    JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr);
-    DataFile deserializedFile = (DataFile) ContentFileParser.fromJson(jsonNode, spec);
-    assertContentFileEquals(dataFile, deserializedFile, spec);
-  }
-
-  @ParameterizedTest
-  @MethodSource("provideSpecAndDeleteFile")
-  public void testDeleteFile(PartitionSpec spec, DeleteFile deleteFile) throws Exception {
-    StringWriter stringWriter = new StringWriter();
-    JsonGenerator jsonStringGenerator = JsonUtil.factory().createGenerator(stringWriter);
-
-    ContentFileParser.toJson(deleteFile, spec, jsonStringGenerator);
-    jsonStringGenerator.close();
-    String jsonStr = stringWriter.toString();
-
-    JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr);
-    DeleteFile deserializedFile = (DeleteFile) ContentFileParser.fromJson(jsonNode, spec);
-    assertContentFileEquals(deleteFile, deserializedFile, spec);
-  }
-
   static void assertContentFileEquals(
      ContentFile<?> expected, ContentFile<?> actual, PartitionSpec spec) {
     Assert.assertEquals(expected.getClass(), actual.getClass());
diff --git a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
index c65e0983de43..c09a8f5bab94 100644
--- a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
+++ b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
@@ -18,16 +18,14 @@
  */
 package org.apache.iceberg;
 
-import java.util.stream.Stream;
 import org.apache.iceberg.expressions.ExpressionUtil;
 import org.apache.iceberg.expressions.Expressions;
 import org.apache.iceberg.expressions.ResidualEvaluator;
 import org.assertj.core.api.Assertions;
 import org.junit.Assert;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.Arguments;
-import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.ValueSource;
 
 public class TestFileScanTaskParser {
   @Test
@@ -41,12 +39,8 @@ public void testNullArguments() {
         .hasMessage("Cannot parse file scan task from null JSON string");
   }
 
-  private static Stream<Arguments> provideArgs() {
-    return Stream.of(Arguments.of(true), Arguments.of(false));
-  }
-
   @ParameterizedTest
-  @MethodSource("provideArgs")
+  @ValueSource(booleans = {true, false})
   public void testParser(boolean caseSensitive) {
     PartitionSpec spec = TableTestBase.SPEC;
     FileScanTask fileScanTask = createScanTask(spec, caseSensitive);
@@ -74,7 +68,7 @@ private FileScanTask createScanTask(PartitionSpec spec, boolean caseSensitive) {
   private static void assertFileScanTaskEquals(
       FileScanTask expected, FileScanTask actual, PartitionSpec spec, boolean caseSensitive) {
     TestContentFileParser.assertContentFileEquals(expected.file(), actual.file(), spec);
-    Assert.assertEquals(expected.deletes().size(), actual.deletes().size());
+    Assertions.assertThat(actual.deletes()).hasSameSizeAs(expected.deletes());
     for (int pos = 0; pos < expected.deletes().size(); ++pos) {
       TestContentFileParser.assertContentFileEquals(
           expected.deletes().get(pos), actual.deletes().get(pos), spec);

From d5cddf883321ae616d350a350d2e0cfbd568480c Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Tue, 4 Apr 2023 21:33:18 -0700
Subject: [PATCH 06/12] fix compiling error after rebase with master

---
 core/src/main/java/org/apache/iceberg/ContentFileParser.java    | 1 +
 core/src/main/java/org/apache/iceberg/DataFiles.java            | 2 +-
 .../test/java/org/apache/iceberg/TestContentFileParser.java     | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/core/src/main/java/org/apache/iceberg/ContentFileParser.java b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
index 6c05ca5cc4a5..ff5ba919bedb 100644
--- a/core/src/main/java/org/apache/iceberg/ContentFileParser.java
+++ b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
@@ -205,6 +205,7 @@ static ContentFile<?> fromJson(JsonNode jsonNode, PartitionSpec spec) {
           metrics,
           equalityFieldIds,
           sortOrderId,
+          splitOffsets,
           keyMetadata);
     }
   }
diff --git a/core/src/main/java/org/apache/iceberg/DataFiles.java b/core/src/main/java/org/apache/iceberg/DataFiles.java
index 7d6c34d00bb9..95b2891c98bc 100644
--- a/core/src/main/java/org/apache/iceberg/DataFiles.java
+++ b/core/src/main/java/org/apache/iceberg/DataFiles.java
@@ -273,7 +273,7 @@ public Builder withSplitOffsets(List<Long> offsets) {
 
     public Builder withEqualityFieldIds(List<Integer> equalityIds) {
       if (equalityIds != null) {
-        this.equalityFieldIds = copyList(equalityIds);
+        this.equalityFieldIds = ImmutableList.copyOf(equalityIds);
       }
 
       return this;
diff --git a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
index 8f32b4904d15..8d42968a292f 100644
--- a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
+++ b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
@@ -176,6 +176,7 @@ private static DeleteFile deleteFileWithRequiredOnly(PartitionSpec spec) {
         new Metrics(9L, null, null, null, null),
         null,
         null,
+        null,
         null);
   }
 
@@ -214,6 +215,7 @@ private static DeleteFile deleteFileWithAllOptional(PartitionSpec spec) {
         metrics,
         new int[] {3},
         1,
+        Arrays.asList(128L),
         ByteBuffer.wrap(new byte[16]));
   }
 
From f4f432064cb5632cc0c8730407e22c6ab6fe1f66 Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Tue, 18 Apr 2023 09:23:49 -0700
Subject: [PATCH 07/12] address comments from Eduard that were missed earlier

---
 .../org/apache/iceberg/ContentFileParser.java | 87 ++++++++++---------
 .../apache/iceberg/FileScanTaskParser.java    | 20 ++---
 .../apache/iceberg/TestContentFileParser.java | 52 ++++++-----
 .../iceberg/TestFileScanTaskParser.java       | 25 +++---
 4 files changed, 94 insertions(+), 90 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/ContentFileParser.java b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
index ff5ba919bedb..286fa622f06d 100644
--- a/core/src/main/java/org/apache/iceberg/ContentFileParser.java
+++ b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
@@ -59,9 +59,9 @@ static String toJson(ContentFile<?> contentFile, PartitionSpec spec) {
 
   static void toJson(ContentFile<?> contentFile, PartitionSpec spec, JsonGenerator generator)
       throws IOException {
-    Preconditions.checkArgument(contentFile != null, "Content file cannot be null");
-    Preconditions.checkArgument(spec != null, "Partition spec cannot be null");
-    Preconditions.checkArgument(generator != null, "JSON generator cannot be null");
+    Preconditions.checkArgument(contentFile != null, "Invalid content file: null");
+    Preconditions.checkArgument(spec != null, "Invalid partition spec: null");
+    Preconditions.checkArgument(generator != null, "Invalid JSON generator: null");
 
     generator.writeStartObject();
 
@@ -70,7 +70,7 @@ static void toJson(ContentFile<?> contentFile, PartitionSpec spec, JsonGenerator
 
     Preconditions.checkArgument(
         contentFile.specId() == spec.specId(),
-        "Partition spec id mismatch: expected = %s, actual = %s",
+        "Invalid partition spec id from content file: expected = %s, actual = %s",
         spec.specId(),
         contentFile.specId());
     generator.writeNumberField(SPEC_ID, contentFile.specId());
@@ -81,9 +81,9 @@ static void toJson(ContentFile<?> contentFile, PartitionSpec spec, JsonGenerator
 
     Preconditions.checkArgument(
         spec.isPartitioned() == hasPartitionData(contentFile.partition()),
-        "Invalid data file: partition data (%s) doesn't match the expected (%s)",
-        hasPartitionData(contentFile.partition()),
-        spec.isPartitioned());
+        "Invalid partition data from content file: expected = %s, actual = %s",
+        spec.isPartitioned() ? "partitioned" : "unpartitioned",
"partitioned" : "unpartitioned"); if (contentFile.partition() != null) { generator.writeFieldName(PARTITION); SingleValueParser.toJson(spec.partitionType(), contentFile.partition(), generator); @@ -92,37 +92,7 @@ static void toJson(ContentFile contentFile, PartitionSpec spec, JsonGenerator generator.writeNumberField(RECORD_COUNT, contentFile.recordCount()); generator.writeNumberField(FILE_SIZE, contentFile.fileSizeInBytes()); - if (contentFile.columnSizes() != null) { - generator.writeFieldName(COLUMN_SIZES); - SingleValueParser.toJson(DataFile.COLUMN_SIZES.type(), contentFile.columnSizes(), generator); - } - - if (contentFile.valueCounts() != null) { - generator.writeFieldName(VALUE_COUNTS); - SingleValueParser.toJson(DataFile.VALUE_COUNTS.type(), contentFile.valueCounts(), generator); - } - - if (contentFile.nullValueCounts() != null) { - generator.writeFieldName(NULL_VALUE_COUNTS); - SingleValueParser.toJson( - DataFile.NULL_VALUE_COUNTS.type(), contentFile.nullValueCounts(), generator); - } - - if (contentFile.nullValueCounts() != null) { - generator.writeFieldName(NAN_VALUE_COUNTS); - SingleValueParser.toJson( - DataFile.NAN_VALUE_COUNTS.type(), contentFile.nanValueCounts(), generator); - } - - if (contentFile.lowerBounds() != null) { - generator.writeFieldName(LOWER_BOUNDS); - SingleValueParser.toJson(DataFile.LOWER_BOUNDS.type(), contentFile.lowerBounds(), generator); - } - - if (contentFile.upperBounds() != null) { - generator.writeFieldName(UPPER_BOUNDS); - SingleValueParser.toJson(DataFile.UPPER_BOUNDS.type(), contentFile.upperBounds(), generator); - } + metricsToJson(contentFile, generator); if (contentFile.keyMetadata() != null) { generator.writeFieldName(KEY_METADATA); @@ -149,10 +119,10 @@ static void toJson(ContentFile contentFile, PartitionSpec spec, JsonGenerator } static ContentFile fromJson(JsonNode jsonNode, PartitionSpec spec) { - Preconditions.checkArgument(jsonNode != null, "Cannot parse content file from null JSON node"); + Preconditions.checkArgument(jsonNode != null, "Invalid JSON node for content file: null"); Preconditions.checkArgument( - jsonNode.isObject(), "Cannot parse content file from a non-object: %s", jsonNode); - Preconditions.checkArgument(spec != null, "Partition spec cannot be null"); + jsonNode.isObject(), "Invalid JSON node for content file: non-object (%s)", jsonNode); + Preconditions.checkArgument(spec != null, "Invalid partition spec: null"); int specId = JsonUtil.getInt(SPEC_ID, jsonNode); FileContent fileContent = FileContent.valueOf(JsonUtil.getString(CONTENT, jsonNode)); @@ -210,6 +180,41 @@ static ContentFile fromJson(JsonNode jsonNode, PartitionSpec spec) { } } + private static void metricsToJson(ContentFile contentFile, JsonGenerator generator) + throws IOException { + if (contentFile.columnSizes() != null) { + generator.writeFieldName(COLUMN_SIZES); + SingleValueParser.toJson(DataFile.COLUMN_SIZES.type(), contentFile.columnSizes(), generator); + } + + if (contentFile.valueCounts() != null) { + generator.writeFieldName(VALUE_COUNTS); + SingleValueParser.toJson(DataFile.VALUE_COUNTS.type(), contentFile.valueCounts(), generator); + } + + if (contentFile.nullValueCounts() != null) { + generator.writeFieldName(NULL_VALUE_COUNTS); + SingleValueParser.toJson( + DataFile.NULL_VALUE_COUNTS.type(), contentFile.nullValueCounts(), generator); + } + + if (contentFile.nullValueCounts() != null) { + generator.writeFieldName(NAN_VALUE_COUNTS); + SingleValueParser.toJson( + DataFile.NAN_VALUE_COUNTS.type(), contentFile.nanValueCounts(), generator); + } 
+
+    if (contentFile.lowerBounds() != null) {
+      generator.writeFieldName(LOWER_BOUNDS);
+      SingleValueParser.toJson(DataFile.LOWER_BOUNDS.type(), contentFile.lowerBounds(), generator);
+    }
+
+    if (contentFile.upperBounds() != null) {
+      generator.writeFieldName(UPPER_BOUNDS);
+      SingleValueParser.toJson(DataFile.UPPER_BOUNDS.type(), contentFile.upperBounds(), generator);
+    }
+  }
+
   private static Metrics metricsFromJson(JsonNode jsonNode) {
     long recordCount = JsonUtil.getLong(RECORD_COUNT, jsonNode);
diff --git a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
index dce5f78cd95b..1ebb9cfde01d 100644
--- a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
+++ b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
@@ -21,7 +21,6 @@
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.JsonNode;
 import java.io.IOException;
-import java.io.UncheckedIOException;
 import org.apache.iceberg.expressions.Expression;
 import org.apache.iceberg.expressions.ExpressionParser;
 import org.apache.iceberg.expressions.Expressions;
@@ -46,7 +45,8 @@ public static String toJson(FileScanTask fileScanTask) {
 
   private static void toJson(FileScanTask fileScanTask, JsonGenerator generator)
       throws IOException {
-    Preconditions.checkArgument(fileScanTask != null, "File scan task cannot be null");
+    Preconditions.checkArgument(fileScanTask != null, "Invalid file scan task: null");
+    Preconditions.checkArgument(generator != null, "Invalid JSON generator: null");
     generator.writeStartObject();
 
     generator.writeFieldName(SCHEMA);
@@ -78,18 +78,14 @@ private static void toJson(FileScanTask fileScanTask, JsonGenerator generator)
   }
 
   public static FileScanTask fromJson(String json, boolean caseSensitive) {
-    Preconditions.checkArgument(json != null, "Cannot parse file scan task from null JSON string");
-    try {
-      JsonNode jsonNode = JsonUtil.mapper().readValue(json, JsonNode.class);
-      return fromJsonNode(jsonNode, caseSensitive);
-    } catch (IOException e) {
-      throw new UncheckedIOException(e);
-    }
+    Preconditions.checkArgument(json != null, "Invalid JSON string for file scan task: null");
+    return JsonUtil.parse(json, node -> FileScanTaskParser.fromJsonNode(node, caseSensitive));
   }
 
   private static FileScanTask fromJsonNode(JsonNode jsonNode, boolean caseSensitive) {
+    Preconditions.checkArgument(jsonNode != null, "Invalid JSON node for file scan task: null");
     Preconditions.checkArgument(
-        jsonNode.isObject(), "Cannot parse file scan task from a non-object: %s", jsonNode);
+        jsonNode.isObject(), "Invalid JSON node for file scan task: non-object (%s)", jsonNode);
 
     JsonNode schemaNode = jsonNode.get(SCHEMA);
     Schema schema = SchemaParser.fromJson(schemaNode);
@@ -108,7 +104,9 @@ private static FileScanTask fromJsonNode(JsonNode jsonNode, boolean caseSensitiv
     if (jsonNode.has(DELETE_FILES)) {
       JsonNode deletesArray = jsonNode.get(DELETE_FILES);
       Preconditions.checkArgument(
-          deletesArray.isArray(), "Cannot parse delete files from a non-array: %s", deletesArray);
+          deletesArray.isArray(),
+          "Invalid JSON node for delete files: non-array (%s)",
+          deletesArray);
       // parse the schema array
       ImmutableList.Builder<DeleteFile> builder = ImmutableList.builder();
       for (JsonNode deleteFileNode : deletesArray) {
diff --git a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
index 8d42968a292f..5cacbf830be0 100644
--- a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
+++ b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
@@ -29,7 +29,6 @@
 import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.JsonUtil;
 import org.assertj.core.api.Assertions;
-import org.junit.Assert;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
@@ -40,26 +39,26 @@ public class TestContentFileParser {
   public void testNullArguments() throws Exception {
     Assertions.assertThatThrownBy(() -> ContentFileParser.toJson(null, TableTestBase.SPEC))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Content file cannot be null");
+        .hasMessage("Invalid content file: null");
 
     Assertions.assertThatThrownBy(() -> ContentFileParser.toJson(TableTestBase.FILE_A, null))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Partition spec cannot be null");
+        .hasMessage("Invalid partition spec: null");
 
     Assertions.assertThatThrownBy(
             () -> ContentFileParser.toJson(TableTestBase.FILE_A, TableTestBase.SPEC, null))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("JSON generator cannot be null");
+        .hasMessage("Invalid JSON generator: null");
 
     Assertions.assertThatThrownBy(() -> ContentFileParser.fromJson(null, TableTestBase.SPEC))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Cannot parse content file from null JSON node");
+        .hasMessage("Invalid JSON node for content file: null");
 
     String jsonStr = ContentFileParser.toJson(TableTestBase.FILE_A, TableTestBase.SPEC);
     JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr);
     Assertions.assertThatThrownBy(() -> ContentFileParser.fromJson(jsonNode, null))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Partition spec cannot be null");
+        .hasMessage("Invalid partition spec: null");
   }
 
   @ParameterizedTest
@@ -221,26 +220,25 @@ private static DeleteFile deleteFileWithAllOptional(PartitionSpec spec) {
 
   static void assertContentFileEquals(
       ContentFile<?> expected, ContentFile<?> actual, PartitionSpec spec) {
-    Assert.assertEquals(expected.getClass(), actual.getClass());
-    Assert.assertEquals(expected.specId(), actual.specId());
-    Assert.assertEquals(expected.content(), actual.content());
-    Assert.assertEquals(expected.path(), actual.path());
-    Assert.assertEquals(expected.format(), actual.format());
-    Assert.assertEquals(
-        0,
-        Comparators.forType(spec.partitionType())
-            .compare(expected.partition(), actual.partition()));
-    Assert.assertEquals(expected.recordCount(), actual.recordCount());
-    Assert.assertEquals(expected.fileSizeInBytes(), actual.fileSizeInBytes());
-    Assert.assertEquals(expected.columnSizes(), actual.columnSizes());
-    Assert.assertEquals(expected.valueCounts(), actual.valueCounts());
-    Assert.assertEquals(expected.nullValueCounts(), actual.nullValueCounts());
-    Assert.assertEquals(expected.nanValueCounts(), actual.nanValueCounts());
-    Assert.assertEquals(expected.lowerBounds(), actual.lowerBounds());
-    Assert.assertEquals(expected.upperBounds(), actual.upperBounds());
-    Assert.assertEquals(expected.keyMetadata(), actual.keyMetadata());
-    Assert.assertEquals(expected.splitOffsets(), actual.splitOffsets());
-    Assert.assertEquals(expected.equalityFieldIds(), actual.equalityFieldIds());
-    Assert.assertEquals(expected.sortOrderId(), actual.sortOrderId());
+    Assertions.assertThat(actual.getClass()).isEqualTo(expected.getClass());
+    Assertions.assertThat(actual.specId()).isEqualTo(expected.specId());
+    Assertions.assertThat(actual.content()).isEqualTo(expected.content());
+    Assertions.assertThat(actual.path()).isEqualTo(expected.path());
+    Assertions.assertThat(actual.format()).isEqualTo(expected.format());
+    Assertions.assertThat(actual.partition())
+        .usingComparator(Comparators.forType(spec.partitionType()))
+        .isEqualTo(expected.partition());
+    Assertions.assertThat(actual.recordCount()).isEqualTo(expected.recordCount());
+    Assertions.assertThat(actual.fileSizeInBytes()).isEqualTo(expected.fileSizeInBytes());
+    Assertions.assertThat(actual.columnSizes()).isEqualTo(expected.columnSizes());
+    Assertions.assertThat(actual.valueCounts()).isEqualTo(expected.valueCounts());
+    Assertions.assertThat(actual.nullValueCounts()).isEqualTo(expected.nullValueCounts());
+    Assertions.assertThat(actual.nanValueCounts()).isEqualTo(expected.nanValueCounts());
+    Assertions.assertThat(actual.lowerBounds()).isEqualTo(expected.lowerBounds());
+    Assertions.assertThat(actual.upperBounds()).isEqualTo(expected.upperBounds());
+    Assertions.assertThat(actual.keyMetadata()).isEqualTo(expected.keyMetadata());
+    Assertions.assertThat(actual.splitOffsets()).isEqualTo(expected.splitOffsets());
+    Assertions.assertThat(actual.equalityFieldIds()).isEqualTo(expected.equalityFieldIds());
+    Assertions.assertThat(actual.sortOrderId()).isEqualTo(expected.sortOrderId());
   }
 }
diff --git a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
index c09a8f5bab94..d85bf0a00265 100644
--- a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
+++ b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java
@@ -22,7 +22,6 @@
 import org.apache.iceberg.expressions.Expressions;
 import org.apache.iceberg.expressions.ResidualEvaluator;
 import org.assertj.core.api.Assertions;
-import org.junit.Assert;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
@@ -32,11 +31,11 @@ public class TestFileScanTaskParser {
   public void testNullArguments() {
     Assertions.assertThatThrownBy(() -> FileScanTaskParser.toJson(null))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("File scan task cannot be null");
+        .hasMessage("Invalid file scan task: null");
 
     Assertions.assertThatThrownBy(() -> FileScanTaskParser.fromJson(null, true))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Cannot parse file scan task from null JSON string");
+        .hasMessage("Invalid JSON string for file scan task: null");
   }
 
   @ParameterizedTest
@@ -74,13 +73,17 @@ private static void assertFileScanTaskEquals(
           expected.deletes().get(pos), actual.deletes().get(pos), spec);
     }
 
-    Assert.assertTrue("Schema should be the same", expected.schema().sameSchema(actual.schema()));
-    Assert.assertEquals(expected.spec(), actual.spec());
-    Assert.assertTrue(
-        ExpressionUtil.equivalent(
-            expected.residual(),
-            actual.residual(),
-            TableTestBase.SCHEMA.asStruct(),
-            caseSensitive));
+    Assertions.assertThat(expected.schema().sameSchema(actual.schema()))
+        .isTrue()
+        .as("Schema should match");
+    Assertions.assertThat(actual.spec()).isEqualTo(expected.spec());
+    Assertions.assertThat(
+            ExpressionUtil.equivalent(
+                expected.residual(),
+                actual.residual(),
+                TableTestBase.SCHEMA.asStruct(),
+                caseSensitive))
+        .isTrue()
+        .as("Residual expression should match");
   }
 }

From a0c7ee30fe0ba3a59f5e57f87c222f522add60fd Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 24 Apr 2023 11:23:22 -0700
Subject: [PATCH 08/12] Address latest round of comments from Eduard

---
 .../org/apache/iceberg/ContentFileParser.java | 32 ++++++++-----------
 .../apache/iceberg/FileScanTaskParser.java    | 10 +++---
 .../iceberg/TestFileScanTaskParser.java       |  8 ++---
 3 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/ContentFileParser.java b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
index 286fa622f06d..b3edf2927fbc 100644
--- a/core/src/main/java/org/apache/iceberg/ContentFileParser.java
+++ b/core/src/main/java/org/apache/iceberg/ContentFileParser.java
@@ -62,34 +62,32 @@ static void toJson(ContentFile<?> contentFile, PartitionSpec spec, JsonGenerator
     Preconditions.checkArgument(contentFile != null, "Invalid content file: null");
     Preconditions.checkArgument(spec != null, "Invalid partition spec: null");
     Preconditions.checkArgument(generator != null, "Invalid JSON generator: null");
+    Preconditions.checkArgument(
+        contentFile.specId() == spec.specId(),
+        "Invalid partition spec id from content file: expected = %s, actual = %s",
+        spec.specId(),
+        contentFile.specId());
+    Preconditions.checkArgument(
+        spec.isPartitioned() == hasPartitionData(contentFile.partition()),
+        "Invalid partition data from content file: expected = %s, actual = %s",
+        spec.isPartitioned() ? "partitioned" : "unpartitioned",
+        hasPartitionData(contentFile.partition()) ? "partitioned" : "unpartitioned");
 
     generator.writeStartObject();
 
     // ignore the ordinal position (ContentFile#pos) of the file in a manifest,
     // as it isn't used and BaseFile constructor doesn't support it.
-    Preconditions.checkArgument(
-        contentFile.specId() == spec.specId(),
-        "Invalid partition spec id from content file: expected = %s, actual = %s",
-        spec.specId(),
-        contentFile.specId());
     generator.writeNumberField(SPEC_ID, contentFile.specId());
-
     generator.writeStringField(CONTENT, contentFile.content().name());
     generator.writeStringField(FILE_PATH, contentFile.path().toString());
     generator.writeStringField(FILE_FORMAT, contentFile.format().name());
 
-    Preconditions.checkArgument(
-        spec.isPartitioned() == hasPartitionData(contentFile.partition()),
-        "Invalid partition data from content file: expected = %s, actual = %s",
-        spec.isPartitioned() ? "partitioned" : "unpartitioned",
"partitioned" : "unpartitioned"); if (contentFile.partition() != null) { generator.writeFieldName(PARTITION); SingleValueParser.toJson(spec.partitionType(), contentFile.partition(), generator); } - generator.writeNumberField(RECORD_COUNT, contentFile.recordCount()); generator.writeNumberField(FILE_SIZE, contentFile.fileSizeInBytes()); metricsToJson(contentFile, generator); @@ -100,15 +98,11 @@ static void toJson(ContentFile contentFile, PartitionSpec spec, JsonGenerator } if (contentFile.splitOffsets() != null) { - generator.writeFieldName(SPLIT_OFFSETS); - SingleValueParser.toJson( - DataFile.SPLIT_OFFSETS.type(), contentFile.splitOffsets(), generator); + JsonUtil.writeLongArray(SPLIT_OFFSETS, contentFile.splitOffsets(), generator); } if (contentFile.equalityFieldIds() != null) { - generator.writeFieldName(EQUALITY_IDS); - SingleValueParser.toJson( - DataFile.EQUALITY_IDS.type(), contentFile.equalityFieldIds(), generator); + JsonUtil.writeIntegerArray(EQUALITY_IDS, contentFile.equalityFieldIds(), generator); } if (contentFile.sortOrderId() != null) { @@ -182,6 +176,8 @@ static ContentFile fromJson(JsonNode jsonNode, PartitionSpec spec) { private static void metricsToJson(ContentFile contentFile, JsonGenerator generator) throws IOException { + generator.writeNumberField(RECORD_COUNT, contentFile.recordCount()); + if (contentFile.columnSizes() != null) { generator.writeFieldName(COLUMN_SIZES); SingleValueParser.toJson(DataFile.COLUMN_SIZES.type(), contentFile.columnSizes(), generator); diff --git a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java index 1ebb9cfde01d..b747eff98b7c 100644 --- a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java +++ b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java @@ -79,20 +79,18 @@ private static void toJson(FileScanTask fileScanTask, JsonGenerator generator) public static FileScanTask fromJson(String json, boolean caseSensitive) { Preconditions.checkArgument(json != null, "Invalid JSON string for file scan task: null"); - return JsonUtil.parse(json, node -> FileScanTaskParser.fromJsonNode(node, caseSensitive)); + return JsonUtil.parse(json, node -> FileScanTaskParser.fromJson(node, caseSensitive)); } - private static FileScanTask fromJsonNode(JsonNode jsonNode, boolean caseSensitive) { + private static FileScanTask fromJson(JsonNode jsonNode, boolean caseSensitive) { Preconditions.checkArgument(jsonNode != null, "Invalid JSON node for file scan task: null"); Preconditions.checkArgument( jsonNode.isObject(), "Invalid JSON node for file scan task: non-object (%s)", jsonNode); - JsonNode schemaNode = jsonNode.get(SCHEMA); - Schema schema = SchemaParser.fromJson(schemaNode); + Schema schema = SchemaParser.fromJson(JsonUtil.get(SCHEMA, jsonNode)); String schemaString = SchemaParser.toJson(schema); - JsonNode specNode = jsonNode.get(SPEC); - PartitionSpec spec = PartitionSpecParser.fromJson(schema, specNode); + PartitionSpec spec = PartitionSpecParser.fromJson(schema, JsonUtil.get(SPEC, jsonNode)); String specString = PartitionSpecParser.toJson(spec); DataFile dataFile = null; diff --git a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java index d85bf0a00265..f2844934da2f 100644 --- a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java +++ b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java @@ -74,8 +74,8 @@ private static void 
@@ -74,8 +74,8 @@ private static void assertFileScanTaskEquals(
     }
 
     Assertions.assertThat(expected.schema().sameSchema(actual.schema()))
-        .isTrue()
-        .as("Schema should match");
+        .as("Schema should match")
+        .isTrue();
     Assertions.assertThat(actual.spec()).isEqualTo(expected.spec());
     Assertions.assertThat(
             ExpressionUtil.equivalent(
                 expected.residual(),
                 actual.residual(),
                 TableTestBase.SCHEMA.asStruct(),
                 caseSensitive))
-        .isTrue()
-        .as("Residual expression should match");
+        .as("Residual expression should match")
+        .isTrue();
   }
 }

From 7daa50276a689d861546a61fcdea17316c97788e Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Wed, 3 May 2023 21:49:13 -0700
Subject: [PATCH 09/12] address Eduard's comments on avoiding SingleValueParser and adding expected json string

---
 .../org/apache/iceberg/util/JsonUtil.java     |  10 +-
 .../apache/iceberg/TestContentFileParser.java | 114 ++++++++++++++++--
 .../iceberg/TestFileScanTaskParser.java       |  19 +++
 .../org/apache/iceberg/util/TestJsonUtil.java |   2 +-
 4 files changed, 131 insertions(+), 14 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java
index 896e777d9630..d312e0983be0 100644
--- a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java
+++ b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java
@@ -29,14 +29,14 @@
 import java.nio.ByteBuffer;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
-import org.apache.iceberg.SingleValueParser;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
-import org.apache.iceberg.types.Types;
+import org.apache.iceberg.relocated.com.google.common.io.BaseEncoding;
 
 public class JsonUtil {
 
@@ -181,7 +181,11 @@ public static ByteBuffer getByteBufferOrNull(String property, JsonNode node) {
       return null;
     }
 
-    return (ByteBuffer) SingleValueParser.fromJson(Types.BinaryType.get(), node.get(property));
+    JsonNode pNode = node.get(property);
+    Preconditions.checkArgument(
+        pNode.isTextual(), "Cannot parse from non-text value: %s: %s", property, pNode);
+    return ByteBuffer.wrap(
+        BaseEncoding.base16().decode(pNode.textValue().toUpperCase(Locale.ROOT)));
   }
 
   public static Map<String, String> getStringMap(String property, JsonNode node) {
diff --git a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
index 5cacbf830be0..9360f571c5bb 100644
--- a/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
+++ b/core/src/test/java/org/apache/iceberg/TestContentFileParser.java
@@ -63,8 +63,10 @@ public void testDataFile(PartitionSpec spec, DataFile dataFile) throws Exception
 
   @ParameterizedTest
   @MethodSource("provideSpecAndDataFile")
-  public void testDataFile(PartitionSpec spec, DataFile dataFile) throws Exception {
+  public void testDataFile(PartitionSpec spec, DataFile dataFile, String expectedJson)
+      throws Exception {
     String jsonStr = ContentFileParser.toJson(dataFile, spec);
+    Assertions.assertThat(jsonStr).isEqualTo(expectedJson);
     JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr);
     ContentFile<?> deserializedContentFile = ContentFileParser.fromJson(jsonNode, spec);
     Assertions.assertThat(deserializedContentFile).isInstanceOf(DataFile.class);
@@ -73,8 +75,10 @@ public void testDataFile(PartitionSpec spec, DataFile dataFile) throws Exception
 
   @ParameterizedTest
   @MethodSource("provideSpecAndDeleteFile")
-  public void testDeleteFile(PartitionSpec spec, DeleteFile deleteFile) throws Exception {
+  public void testDeleteFile(PartitionSpec spec, DeleteFile deleteFile, String expectedJson)
+      throws Exception {
     String jsonStr = ContentFileParser.toJson(deleteFile, spec);
+    Assertions.assertThat(jsonStr).isEqualTo(expectedJson);
     JsonNode jsonNode = JsonUtil.mapper().readTree(jsonStr);
     ContentFile<?> deserializedContentFile = ContentFileParser.fromJson(jsonNode, spec);
     Assertions.assertThat(deserializedContentFile).isInstanceOf(DeleteFile.class);
@@ -84,11 +88,21 @@ public void testDeleteFile(PartitionSpec spec, DeleteFile deleteFile) throws Exc
   private static Stream<Arguments> provideSpecAndDataFile() {
     return Stream.of(
         Arguments.of(
-            PartitionSpec.unpartitioned(), dataFileWithRequiredOnly(PartitionSpec.unpartitioned())),
+            PartitionSpec.unpartitioned(),
+            dataFileWithRequiredOnly(PartitionSpec.unpartitioned()),
+            dataFileJsonWithRequiredOnly(PartitionSpec.unpartitioned())),
+        Arguments.of(
+            PartitionSpec.unpartitioned(),
+            dataFileWithAllOptional(PartitionSpec.unpartitioned()),
+            dataFileJsonWithAllOptional(PartitionSpec.unpartitioned())),
         Arguments.of(
-            PartitionSpec.unpartitioned(), dataFileWithAllOptional(PartitionSpec.unpartitioned())),
-        Arguments.of(TableTestBase.SPEC, dataFileWithRequiredOnly(TableTestBase.SPEC)),
-        Arguments.of(TableTestBase.SPEC, dataFileWithAllOptional(TableTestBase.SPEC)));
+            TableTestBase.SPEC,
+            dataFileWithRequiredOnly(TableTestBase.SPEC),
+            dataFileJsonWithRequiredOnly(TableTestBase.SPEC)),
+        Arguments.of(
+            TableTestBase.SPEC,
+            dataFileWithAllOptional(TableTestBase.SPEC),
+            dataFileJsonWithAllOptional(TableTestBase.SPEC)));
   }
 
   private static DataFile dataFileWithRequiredOnly(PartitionSpec spec) {
@@ -106,6 +120,42 @@ private static DataFile dataFileWithRequiredOnly(PartitionSpec spec) {
     return builder.build();
   }
 
+  private static String dataFileJsonWithRequiredOnly(PartitionSpec spec) {
+    if (spec.isUnpartitioned()) {
+      return "{\"spec-id\":0,\"content\":\"DATA\",\"file-path\":\"/path/to/data-a.parquet\",\"file-format\":\"PARQUET\","
+          + "\"partition\":{},\"file-size-in-bytes\":10,\"record-count\":1,\"sort-order-id\":0}";
+    } else {
+      return "{\"spec-id\":0,\"content\":\"DATA\",\"file-path\":\"/path/to/data-a.parquet\",\"file-format\":\"PARQUET\","
+          + "\"partition\":{\"1000\":1},\"file-size-in-bytes\":10,\"record-count\":1,\"sort-order-id\":0}";
+    }
+  }
+
+  private static String dataFileJsonWithAllOptional(PartitionSpec spec) {
+    if (spec.isUnpartitioned()) {
+      return "{\"spec-id\":0,\"content\":\"DATA\",\"file-path\":\"/path/to/data-with-stats.parquet\","
+          + "\"file-format\":\"PARQUET\",\"partition\":{},\"file-size-in-bytes\":350,\"record-count\":10,"
+          + "\"column-sizes\":{\"keys\":[3,4],\"values\":[100,200]},"
+          + "\"value-counts\":{\"keys\":[3,4],\"values\":[90,180]},"
+          + "\"null-value-counts\":{\"keys\":[3,4],\"values\":[10,20]},"
+          + "\"nan-value-counts\":{\"keys\":[3,4],\"values\":[0,0]},"
+          + "\"lower-bounds\":{\"keys\":[3,4],\"values\":[\"01000000\",\"02000000\"]},"
+          + "\"upper-bounds\":{\"keys\":[3,4],\"values\":[\"05000000\",\"0A000000\"]},"
+          + "\"key-metadata\":\"00000000000000000000000000000000\","
+          + "\"split-offsets\":[128,256],\"equality-ids\":[1],\"sort-order-id\":1}";
+    } else {
+      return "{\"spec-id\":0,\"content\":\"DATA\",\"file-path\":\"/path/to/data-with-stats.parquet\","
"\"file-format\":\"PARQUET\",\"partition\":{\"1000\":1},\"file-size-in-bytes\":350,\"record-count\":10," + + "\"column-sizes\":{\"keys\":[3,4],\"values\":[100,200]}," + + "\"value-counts\":{\"keys\":[3,4],\"values\":[90,180]}," + + "\"null-value-counts\":{\"keys\":[3,4],\"values\":[10,20]}," + + "\"nan-value-counts\":{\"keys\":[3,4],\"values\":[0,0]}," + + "\"lower-bounds\":{\"keys\":[3,4],\"values\":[\"01000000\",\"02000000\"]}," + + "\"upper-bounds\":{\"keys\":[3,4],\"values\":[\"05000000\",\"0A000000\"]}," + + "\"key-metadata\":\"00000000000000000000000000000000\"," + + "\"split-offsets\":[128,256],\"equality-ids\":[1],\"sort-order-id\":1}"; + } + } + private static DataFile dataFileWithAllOptional(PartitionSpec spec) { DataFiles.Builder builder = DataFiles.builder(spec) @@ -150,12 +200,20 @@ private static Stream provideSpecAndDeleteFile() { return Stream.of( Arguments.of( PartitionSpec.unpartitioned(), - deleteFileWithRequiredOnly(PartitionSpec.unpartitioned())), + deleteFileWithRequiredOnly(PartitionSpec.unpartitioned()), + deleteFileJsonWithRequiredOnly(PartitionSpec.unpartitioned())), Arguments.of( PartitionSpec.unpartitioned(), - deleteFileWithAllOptional(PartitionSpec.unpartitioned())), - Arguments.of(TableTestBase.SPEC, deleteFileWithRequiredOnly(TableTestBase.SPEC)), - Arguments.of(TableTestBase.SPEC, deleteFileWithAllOptional(TableTestBase.SPEC))); + deleteFileWithAllOptional(PartitionSpec.unpartitioned()), + deleteFileJsonWithAllOptional(PartitionSpec.unpartitioned())), + Arguments.of( + TableTestBase.SPEC, + deleteFileWithRequiredOnly(TableTestBase.SPEC), + deleteFileJsonWithRequiredOnly(TableTestBase.SPEC)), + Arguments.of( + TableTestBase.SPEC, + deleteFileWithAllOptional(TableTestBase.SPEC), + deleteFileJsonWithAllOptional(TableTestBase.SPEC))); } private static DeleteFile deleteFileWithRequiredOnly(PartitionSpec spec) { @@ -218,6 +276,42 @@ private static DeleteFile deleteFileWithAllOptional(PartitionSpec spec) { ByteBuffer.wrap(new byte[16])); } + private static String deleteFileJsonWithRequiredOnly(PartitionSpec spec) { + if (spec.isUnpartitioned()) { + return "{\"spec-id\":0,\"content\":\"POSITION_DELETES\",\"file-path\":\"/path/to/delete-a.parquet\"," + + "\"file-format\":\"PARQUET\",\"partition\":{},\"file-size-in-bytes\":1234,\"record-count\":9}"; + } else { + return "{\"spec-id\":0,\"content\":\"POSITION_DELETES\",\"file-path\":\"/path/to/delete-a.parquet\"," + + "\"file-format\":\"PARQUET\",\"partition\":{\"1000\":9},\"file-size-in-bytes\":1234,\"record-count\":9}"; + } + } + + private static String deleteFileJsonWithAllOptional(PartitionSpec spec) { + if (spec.isUnpartitioned()) { + return "{\"spec-id\":0,\"content\":\"EQUALITY_DELETES\",\"file-path\":\"/path/to/delete-with-stats.parquet\"," + + "\"file-format\":\"PARQUET\",\"partition\":{},\"file-size-in-bytes\":1234,\"record-count\":10," + + "\"column-sizes\":{\"keys\":[3,4],\"values\":[100,200]}," + + "\"value-counts\":{\"keys\":[3,4],\"values\":[90,180]}," + + "\"null-value-counts\":{\"keys\":[3,4],\"values\":[10,20]}," + + "\"nan-value-counts\":{\"keys\":[3,4],\"values\":[0,0]}," + + "\"lower-bounds\":{\"keys\":[3,4],\"values\":[\"01000000\",\"02000000\"]}," + + "\"upper-bounds\":{\"keys\":[3,4],\"values\":[\"05000000\",\"0A000000\"]}," + + "\"key-metadata\":\"00000000000000000000000000000000\"," + + "\"split-offsets\":[128],\"equality-ids\":[3],\"sort-order-id\":1}"; + } else { + return "{\"spec-id\":0,\"content\":\"EQUALITY_DELETES\",\"file-path\":\"/path/to/delete-with-stats.parquet\"," + + 
"\"file-format\":\"PARQUET\",\"partition\":{\"1000\":9},\"file-size-in-bytes\":1234,\"record-count\":10," + + "\"column-sizes\":{\"keys\":[3,4],\"values\":[100,200]}," + + "\"value-counts\":{\"keys\":[3,4],\"values\":[90,180]}," + + "\"null-value-counts\":{\"keys\":[3,4],\"values\":[10,20]}," + + "\"nan-value-counts\":{\"keys\":[3,4],\"values\":[0,0]}," + + "\"lower-bounds\":{\"keys\":[3,4],\"values\":[\"01000000\",\"02000000\"]}," + + "\"upper-bounds\":{\"keys\":[3,4],\"values\":[\"05000000\",\"0A000000\"]}," + + "\"key-metadata\":\"00000000000000000000000000000000\"," + + "\"split-offsets\":[128],\"equality-ids\":[3],\"sort-order-id\":1}"; + } + } + static void assertContentFileEquals( ContentFile expected, ContentFile actual, PartitionSpec spec) { Assertions.assertThat(actual.getClass()).isEqualTo(expected.getClass()); diff --git a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java index f2844934da2f..221a5507b1c6 100644 --- a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java +++ b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java @@ -44,6 +44,7 @@ public void testParser(boolean caseSensitive) { PartitionSpec spec = TableTestBase.SPEC; FileScanTask fileScanTask = createScanTask(spec, caseSensitive); String jsonStr = FileScanTaskParser.toJson(fileScanTask); + Assertions.assertThat(jsonStr).isEqualTo(expectedFileScanTaskJson()); FileScanTask deserializedTask = FileScanTaskParser.fromJson(jsonStr, caseSensitive); assertFileScanTaskEquals(fileScanTask, deserializedTask, spec, caseSensitive); } @@ -64,6 +65,24 @@ private FileScanTask createScanTask(PartitionSpec spec, boolean caseSensitive) { residualEvaluator); } + private String expectedFileScanTaskJson() { + return "{\"schema\":{\"type\":\"struct\",\"schema-id\":0,\"fields\":[" + + "{\"id\":3,\"name\":\"id\",\"required\":true,\"type\":\"int\"}," + + "{\"id\":4,\"name\":\"data\",\"required\":true,\"type\":\"string\"}]}," + + "\"spec\":{\"spec-id\":0,\"fields\":[{\"name\":\"data_bucket\"," + + "\"transform\":\"bucket[16]\",\"source-id\":4,\"field-id\":1000}]}," + + "\"data-file\":{\"spec-id\":0,\"content\":\"DATA\",\"file-path\":\"/path/to/data-a.parquet\"," + + "\"file-format\":\"PARQUET\",\"partition\":{\"1000\":0}," + + "\"file-size-in-bytes\":10,\"record-count\":1,\"sort-order-id\":0}," + + "\"delete-files\":[{\"spec-id\":0,\"content\":\"POSITION_DELETES\"," + + "\"file-path\":\"/path/to/data-a-deletes.parquet\",\"file-format\":\"PARQUET\"," + + "\"partition\":{\"1000\":0},\"file-size-in-bytes\":10,\"record-count\":1}," + + "{\"spec-id\":0,\"content\":\"EQUALITY_DELETES\",\"file-path\":\"/path/to/data-a2-deletes.parquet\"," + + "\"file-format\":\"PARQUET\",\"partition\":{\"1000\":0},\"file-size-in-bytes\":10," + + "\"record-count\":1,\"equality-ids\":[1],\"sort-order-id\":0}]," + + "\"residual-filter\":{\"type\":\"eq\",\"term\":\"id\",\"value\":1}}"; + } + private static void assertFileScanTaskEquals( FileScanTask expected, FileScanTask actual, PartitionSpec spec, boolean caseSensitive) { TestContentFileParser.assertContentFileEquals(expected.file(), actual.file(), spec); diff --git a/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java b/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java index c8ba7b61efb5..dee4c0c8c40a 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java @@ -186,7 +186,7 @@ public void 
@@ -186,7 +186,7 @@ public void getByteBufferOrNull() throws JsonProcessingException {
     Assertions.assertThatThrownBy(
             () -> JsonUtil.getByteBufferOrNull("x", JsonUtil.mapper().readTree("{\"x\": 23}")))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Cannot parse default as a binary value: 23");
+        .hasMessage("Cannot parse from non-text value: x: 23");
   }
 
   @Test

From fddc4bbd125a100bd633c6760663db61fe567824 Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Thu, 4 May 2023 13:21:09 -0700
Subject: [PATCH 10/12] improve error msg from JsonUtil

---
 .../java/org/apache/iceberg/util/JsonUtil.java |  6 +++---
 .../org/apache/iceberg/util/TestJsonUtil.java  | 16 ++++++++--------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java
index d312e0983be0..aa90c63f80da 100644
--- a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java
+++ b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java
@@ -183,7 +183,7 @@ public static ByteBuffer getByteBufferOrNull(String property, JsonNode node) {
 
     JsonNode pNode = node.get(property);
     Preconditions.checkArgument(
-        pNode.isTextual(), "Cannot parse from non-text value: %s: %s", property, pNode);
+        pNode.isTextual(), "Cannot parse byte buffer from non-text value: %s: %s", property, pNode);
     return ByteBuffer.wrap(
         BaseEncoding.base16().decode(pNode.textValue().toUpperCase(Locale.ROOT)));
   }
@@ -193,7 +193,7 @@ public static Map<String, String> getStringMap(String property, JsonNode node) {
     JsonNode pNode = node.get(property);
     Preconditions.checkArgument(
         pNode != null && !pNode.isNull() && pNode.isObject(),
-        "Cannot parse from non-object value: %s: %s",
+        "Cannot parse string map from non-object value: %s: %s",
         property,
         pNode);
 
@@ -322,7 +322,7 @@ abstract static class JsonArrayIterator<T> implements Iterator<T> {
       JsonNode pNode = node.get(property);
       Preconditions.checkArgument(
           pNode != null && !pNode.isNull() && pNode.isArray(),
-          "Cannot parse from non-array value: %s: %s",
+          "Cannot parse JSON array from non-array value: %s: %s",
          property,
           pNode);
       this.elements = pNode.elements();
diff --git a/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java b/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java
index dee4c0c8c40a..75191f4d706f 100644
--- a/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java
+++ b/core/src/test/java/org/apache/iceberg/util/TestJsonUtil.java
@@ -186,7 +186,7 @@ public void getByteBufferOrNull() throws JsonProcessingException {
     Assertions.assertThatThrownBy(
             () -> JsonUtil.getByteBufferOrNull("x", JsonUtil.mapper().readTree("{\"x\": 23}")))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Cannot parse from non-text value: x: 23");
+        .hasMessage("Cannot parse byte buffer from non-text value: x: 23");
   }
 
   @Test
@@ -248,7 +248,7 @@ public void getIntegerList() throws JsonProcessingException {
     Assertions.assertThatThrownBy(
             () -> JsonUtil.getIntegerList("items", JsonUtil.mapper().readTree("{\"items\": null}")))
         .isInstanceOf(IllegalArgumentException.class)
-        .hasMessage("Cannot parse from non-array value: items: null");
+        .hasMessage("Cannot parse JSON array from non-array value: items: null");
 
     Assertions.assertThatThrownBy(
             () ->
@@ -284,7 +284,7 @@ public void getIntegerSet() throws JsonProcessingException {
     Assertions.assertThatThrownBy(
             () -> JsonUtil.getIntegerSet("items", JsonUtil.mapper().readTree("{\"items\": null}")))
         .isInstanceOf(IllegalArgumentException.class)
null"); + .hasMessage("Cannot parse JSON array from non-array value: items: null"); Assertions.assertThatThrownBy( () -> @@ -330,7 +330,7 @@ public void getLongList() throws JsonProcessingException { Assertions.assertThatThrownBy( () -> JsonUtil.getLongList("items", JsonUtil.mapper().readTree("{\"items\": null}"))) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse from non-array value: items: null"); + .hasMessage("Cannot parse JSON array from non-array value: items: null"); Assertions.assertThatThrownBy( () -> @@ -388,7 +388,7 @@ public void getLongSet() throws JsonProcessingException { Assertions.assertThatThrownBy( () -> JsonUtil.getLongSet("items", JsonUtil.mapper().readTree("{\"items\": null}"))) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse from non-array value: items: null"); + .hasMessage("Cannot parse JSON array from non-array value: items: null"); Assertions.assertThatThrownBy( () -> @@ -433,7 +433,7 @@ public void getStringList() throws JsonProcessingException { Assertions.assertThatThrownBy( () -> JsonUtil.getStringList("items", JsonUtil.mapper().readTree("{\"items\": null}"))) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse from non-array value: items: null"); + .hasMessage("Cannot parse JSON array from non-array value: items: null"); Assertions.assertThatThrownBy( () -> @@ -492,7 +492,7 @@ public void getStringSet() throws JsonProcessingException { Assertions.assertThatThrownBy( () -> JsonUtil.getStringSet("items", JsonUtil.mapper().readTree("{\"items\": null}"))) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse from non-array value: items: null"); + .hasMessage("Cannot parse JSON array from non-array value: items: null"); Assertions.assertThatThrownBy( () -> @@ -517,7 +517,7 @@ public void getStringMap() throws JsonProcessingException { Assertions.assertThatThrownBy( () -> JsonUtil.getStringMap("items", JsonUtil.mapper().readTree("{\"items\": null}"))) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse from non-object value: items: null"); + .hasMessage("Cannot parse string map from non-object value: items: null"); Assertions.assertThatThrownBy( () -> From 6ecd0e4f088d9b2b5c665e2f36e4b6d0d30c1756 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Thu, 4 May 2023 14:38:30 -0700 Subject: [PATCH 11/12] fix TestTableIdentifierParser due to error msg change in JsonUtil --- .../org/apache/iceberg/catalog/TestTableIdentifierParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/java/org/apache/iceberg/catalog/TestTableIdentifierParser.java b/core/src/test/java/org/apache/iceberg/catalog/TestTableIdentifierParser.java index 04972e5cd625..587b9395f514 100644 --- a/core/src/test/java/org/apache/iceberg/catalog/TestTableIdentifierParser.java +++ b/core/src/test/java/org/apache/iceberg/catalog/TestTableIdentifierParser.java @@ -95,7 +95,7 @@ public void testFailWhenFieldsHaveInvalidValues() { String invalidNamespace = "{\"namespace\":\"accounting.tax\",\"name\":\"paid\"}"; Assertions.assertThatThrownBy(() -> TableIdentifierParser.fromJson(invalidNamespace)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse from non-array value: namespace: \"accounting.tax\""); + .hasMessage("Cannot parse JSON array from non-array value: namespace: \"accounting.tax\""); String invalidName = "{\"namespace\":[\"accounting\",\"tax\"],\"name\":1234}"; Assertions.assertThatThrownBy(() -> 

From 81058110f291eec57db8719f7811710995aed4db Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Sun, 25 Jun 2023 06:51:20 -0700
Subject: [PATCH 12/12] fix TestListTablesResponse after rebase

---
 .../apache/iceberg/rest/responses/TestListTablesResponse.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestListTablesResponse.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestListTablesResponse.java
index 9e8bffaf6d7f..116d43a6d147 100644
--- a/core/src/test/java/org/apache/iceberg/rest/responses/TestListTablesResponse.java
+++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestListTablesResponse.java
@@ -67,7 +67,8 @@ public void testDeserializeInvalidResponsesThrows() {
         "{\"identifiers\":[{\"namespace\":\"accounting.tax\",\"name\":\"paid\"}]}";
     Assertions.assertThatThrownBy(() -> deserialize(jsonWithInvalidIdentifiersInList))
         .isInstanceOf(JsonProcessingException.class)
-        .hasMessageContaining("Cannot parse from non-array value");
+        .hasMessageContaining(
+            "Cannot parse JSON array from non-array value: namespace: \"accounting.tax\"");
 
     String jsonWithInvalidIdentifiersInList2 =
         "{\"identifiers\":[{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"},\"accounting.tax.paid\"]}";
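
[Editor's illustration, not part of the patch series.] The REST test above
keeps hasMessageContaining rather than hasMessage because the
IllegalArgumentException raised while parsing an identifier element reaches
the caller wrapped in a JsonProcessingException that carries extra Jackson
context around the detail string. Parsed standalone through the public
TableIdentifierParser entry point, as the patch 11 test does, the same
element yields exactly the detail the REST test now pins down. A sketch,
with a hypothetical class name:

import org.apache.iceberg.catalog.TableIdentifierParser;

public class RestFixtureSketch {
  public static void main(String[] args) {
    try {
      // The invalid element from the jsonWithInvalidIdentifiersInList fixture.
      TableIdentifierParser.fromJson("{\"namespace\":\"accounting.tax\",\"name\":\"paid\"}");
    } catch (IllegalArgumentException e) {
      // Prints: Cannot parse JSON array from non-array value: namespace: "accounting.tax"
      System.out.println(e.getMessage());
    }
  }
}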