Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion core/src/main/java/org/apache/iceberg/avro/ValueReaders.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import java.util.function.BiFunction;
import java.util.function.Supplier;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
Expand All @@ -44,6 +45,7 @@
import org.apache.iceberg.common.DynConstructors;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.UUIDUtil;
Expand Down Expand Up @@ -199,6 +201,25 @@ public static List<Pair<Integer, ValueReader<?>>> buildReadPlan(
Schema record,
List<ValueReader<?>> fieldReaders,
Map<Integer, ?> idToConstant) {
return buildReadPlan(expected, record, fieldReaders, idToConstant, (type, value) -> value);
}

/**
* Builds a read plan for record classes that use planned reads instead of a ResolvingDecoder.
*
* @param expected expected StructType
* @param record Avro record schema
* @param fieldReaders list of readers for each field in the Avro record schema
* @param idToConstant a map of field ID to constants values
* @param convert function to convert from internal classes to the target object model
* @return a read plan that is a list of (position, reader) pairs
*/
public static List<Pair<Integer, ValueReader<?>>> buildReadPlan(
Types.StructType expected,
Schema record,
List<ValueReader<?>> fieldReaders,
Map<Integer, ?> idToConstant,
BiFunction<Type, Object, Object> convert) {
Map<Integer, Integer> idToPos = idToPos(expected);

List<Pair<Integer, ValueReader<?>>> readPlan = Lists.newArrayList();
Expand Down Expand Up @@ -228,7 +249,9 @@ public static List<Pair<Integer, ValueReader<?>>> buildReadPlan(
if (constant != null) {
readPlan.add(Pair.of(pos, ValueReaders.constant(constant)));
} else if (field.initialDefault() != null) {
readPlan.add(Pair.of(pos, ValueReaders.constant(field.initialDefault())));
readPlan.add(
Pair.of(
pos, ValueReaders.constant(convert.apply(field.type(), field.initialDefault()))));
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This raises a question: should we also convert the constants passed by idToConstant?

I think we should but not yet. The problem is that this conversion probably can't be done twice because of assumptions in the conversion (like casting strings to UTF8String in Spark) so it is worth a separate PR to refactor all of the conversions to be done in one place. That will also be different for each file format so it could get messy.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's do that in a separate PR indeed 👍

} else if (fieldId == MetadataColumns.IS_DELETED.fieldId()) {
readPlan.add(Pair.of(pos, ValueReaders.constant(false)));
} else if (fieldId == MetadataColumns.ROW_POSITION.fieldId()) {
Expand Down
60 changes: 60 additions & 0 deletions core/src/main/java/org/apache/iceberg/data/GenericDataUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.data;

import java.nio.ByteBuffer;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ByteBuffers;
import org.apache.iceberg.util.DateTimeUtil;

/** Utility methods for working with Iceberg's generic data model */
public class GenericDataUtil {
private GenericDataUtil() {}

/**
* Convert a value from Iceberg's internal data model to the generic data model.
*
* @param type a data type
* @param value value to convert
* @return the value in the generic data model representation
*/
public static Object internalToGeneric(Type type, Object value) {
if (null == value) {
return null;
}

switch (type.typeId()) {
case DATE:
return DateTimeUtil.dateFromDays((Integer) value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return DateTimeUtil.dateFromDays((Integer) value);
return DateTimeUtil.dateFromDays((int) value);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Value is an Object, so I cast to the object rather than the primitive. Looks like you're right that we can cast directly and unbox at the same time based on the method signature... but I typically don't do that for style reasons. If you cast to the primitive, then there's a risk of introducing an unnecessary failure if the value is null and the receiving function accepts an Integer that can be null. I think casting to the object version is better to avoid those problems.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My reasoning is similar, but the other way around:

As you mentioned, the method accepts an int:
https://github.com/apache/iceberg/blob/91a1505d09cebcd1d088ac53cd42732c343883de/api/src/main/java/org/ap
ache/iceberg/util/DateTimeUtil.java#L49-L51

Casting to an Integer gives the impression to me that a null is acceptable here, while it is not. I'm not very strong on this one, let's pull in @nastra as a tiebreaker :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Ryan's reasoning. Casting to int will throw a NPE because it will first cast it to Integer and then throw the NPE when doing Integer.intValue(). That being said, it's safer to always cast to the object type instead of the primitive in case the value can be null

case TIME:
return DateTimeUtil.timeFromMicros((Long) value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return DateTimeUtil.timeFromMicros((Long) value);
return DateTimeUtil.timeFromMicros((long) value);

case TIMESTAMP:
if (((Types.TimestampType) type).shouldAdjustToUTC()) {
return DateTimeUtil.timestamptzFromMicros((Long) value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return DateTimeUtil.timestamptzFromMicros((Long) value);
return DateTimeUtil.timestamptzFromMicros((long) value);

} else {
return DateTimeUtil.timestampFromMicros((Long) value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return DateTimeUtil.timestampFromMicros((Long) value);
return DateTimeUtil.timestampFromMicros((long) value);

}
case FIXED:
return ByteBuffers.toByteArray((ByteBuffer) value);
}

return value;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.iceberg.avro.SupportsRowPosition;
import org.apache.iceberg.avro.ValueReader;
import org.apache.iceberg.avro.ValueReaders;
import org.apache.iceberg.data.GenericDataUtil;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
Expand Down Expand Up @@ -97,7 +98,8 @@ public ValueReader<?> record(Type partner, Schema record, List<ValueReader<?>> f

Types.StructType expected = partner.asStructType();
List<Pair<Integer, ValueReader<?>>> readPlan =
ValueReaders.buildReadPlan(expected, record, fieldReaders, idToConstant);
ValueReaders.buildReadPlan(
expected, record, fieldReaders, idToConstant, GenericDataUtil::internalToGeneric);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is only needed for the generic data model and not for the internal data model because defaults are already using the correct classes for internal.


return GenericReaders.struct(readPlan, expected);
}
Expand Down
Loading