Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
09cd709
Hive Catalog: Add a hive catalog that does not override existing Hive…
shardulm94 Oct 25, 2019
1dd5c14
Shading: Add a iceberg-runtime shaded module (#12)
rdsr Nov 4, 2019
21c0276
ORC: Add test for reading files without Iceberg IDs (#16)
shardulm94 Dec 11, 2019
5373637
Hive Metadata Scan: Support reading tables with only Hive metadata (#…
shardulm94 Mar 18, 2020
80e6317
Row level filtering: Allow table scans to pass a row level filter for…
shardulm94 May 22, 2020
d2e4f4b
Hive: Made Predicate Pushdown dynamic based on the Hive Version
HotSushi Sep 17, 2020
d959c40
Hive: Fix uppercase bug and determine catalog from table properties (…
HotSushi Oct 2, 2020
c51911d
Hive: Fix schema not forwarded to SerDe on MR jobs (#45) (#47)
HotSushi Nov 20, 2020
5a5adcd
Hive Metadata Scan: Support case insensitive name mapping (#52)
shardulm94 Jan 11, 2021
854fe29
Hive Metadata Scan: Merge Hive and Avro schemas to fix datatype incon…
shardulm94 Feb 26, 2021
eb26358
Stop using serdeToFileFormat to unblock formats other than Avro or Or…
rzhang10 Apr 13, 2021
37c9ae2
Do not delete metadata location when HMS has been successfully update…
ZihanLi58 May 12, 2021
4bdb0c5
Support reading Avro complex union types (#73)
funcheetah Jun 4, 2021
15df78f
[#2039] Support default value semantic for AVRO (#75)
Jun 21, 2021
1e1b4b9
Support hive non string partition cols (#78)
rzhang10 Jul 13, 2021
25a7201
Support default value read for ORC format in spark (#76)
rzhang10 Jul 19, 2021
c67ec8b
Support reading ORC complex union types (#74)
funcheetah Jul 24, 2021
52ec9b9
Support avro.schema.literal/hive union types in Hive legacy table to …
rzhang10 Jul 28, 2021
fc51e32
Fix ORC schema visitors to support reading ORC files with deeply nest…
rzhang10 Sep 7, 2021
1757af7
Disable avro validation for default values
Sep 21, 2021
1f63d32
Fix spark avro reader reading union schema data (#83)
rzhang10 Oct 22, 2021
973e3dc
Avro: Change union read schema from hive to trino (#84)
rzhang10 Dec 7, 2021
611b256
ORC: Change union read schema from hive to trino (#85)
rzhang10 Dec 7, 2021
987eb1d
Recorder hive table properties to align the avro.schema.literal place…
rzhang10 Dec 8, 2021
8cc2711
[#2039] Support default value semantic for AVRO
May 12, 2021
64ed521
reverting commits 2c59857a and f362aed6 (#88)
Feb 1, 2022
7d41f3e
logically patching PR 2328 on HiveMetadataPreservingTableOperations
autumnust Feb 3, 2022
33bd0da
Support timestamp as partition type (#91)
ljfgem Feb 18, 2022
1891a1a
Separate classes under hive legacy package to new hivelink module (#87)
funcheetah Mar 7, 2022
a8bf3fb
[LI] Align default value validation align with avro semantics in term…
rzhang10 Mar 8, 2022
e7149e0
[LI][Spark][Avro] read avro union using decoder instead of directly r…
rzhang10 Mar 16, 2022
7b41f2f
Improve the logging when the deserailzed index is invalid to read the…
yiqiangin Apr 15, 2022
89659e7
Move custom hive catalog to hivelink-core (#99)
ljfgem Apr 21, 2022
98bec68
Handle non-nullable union of single type for Avro (#98)
funcheetah Apr 22, 2022
85b8699
Handle null default in nested type default value situations (#100)
rzhang10 Apr 28, 2022
b77b362
Move 'Hive Metadata Scan: Support case insensitive name mapping' (PR …
ljfgem May 9, 2022
a3be29e
Remove activeSparkSession (#103)
ljfgem May 12, 2022
955de3b
Disable default value preserving (#106)
rzhang10 May 12, 2022
52f5420
[LI][Avro] Do not reorder elements inside a Avro union schema (#93)
rzhang10 May 12, 2022
e7e137e
handle single type union properly in AvroSchemaVisitor for deep neste…
funcheetah May 13, 2022
6b17f13
Handle non-nullable union of single type for ORC spark non-vectorized…
funcheetah May 16, 2022
c34bfdf
[Avro] Retain the type of field while copying the default values. (#109)
maluchari May 24, 2022
0060364
[Hivelink] Refactor support hive non string partition cols to rid of …
rzhang10 May 27, 2022
f78b3bb
Release automation overhaul: Sonatype Nexus, Shipkit and GH Actions (…
ljfgem Jun 1, 2022
dc67196
Add scm and developer info (#111)
ljfgem Jun 2, 2022
85b64b7
[Core] Fix and refactor schema parser (#112)
rzhang10 Jun 7, 2022
4ed6206
Enhance the UT for testing required fields with default values (#113)
maluchari Jun 30, 2022
56bd803
Support single type union for ORC-vectorization reader (#114)
yiqiangin Jul 12, 2022
6a8ddb2
Refactor HMS code upon cherry-pick
rzhang10 Jul 18, 2022
af26829
Check for schema corruption and fix it on commit (#117)
jack-moseley Jul 27, 2022
753fb4e
ORC: Handle query where select and filter only uses default value col…
rzhang10 Aug 1, 2022
b14596b
Set ORC columns and fix case-sensitivity issue with schema check (#119)
jack-moseley Aug 9, 2022
1181128
Hive: Return null for currentSnapshot() (#121)
shardulm94 Aug 15, 2022
26b7e11
Fix MergeHiveSchemaWithAvro to make it copy full Avro schema attribut…
rzhang10 Aug 16, 2022
3b6c98b
Add logic to derive partition column id from partition.column.ids pro…
rzhang10 Aug 16, 2022
0b0505d
Do not push down filter to ORC for union type schema (#123)
rzhang10 Aug 23, 2022
d49dac4
Bug fix: MergeHiveSchemaWithAvro should retain avro properties for li…
rzhang10 Sep 22, 2022
c39b754
LinkedIn rebase draft
rzhang10 Sep 28, 2022
7968801
Refactor hivelink 1
rzhang10 Sep 30, 2022
c6c4458
Make hivelink module test all pass
rzhang10 Sep 30, 2022
62f9ff3
Make spark 2.4 module work
rzhang10 Oct 11, 2022
47c32df
Fix mr module
rzhang10 Oct 11, 2022
3f35222
Make spark 3.1 module work
rzhang10 Oct 12, 2022
3d955d8
Fix TestSparkMetadataColumns
rzhang10 Oct 12, 2022
656c692
Minor fix for spark 2.4
rzhang10 Oct 12, 2022
343fd1c
Update default spark version to 3.1
rzhang10 Oct 12, 2022
794a46b
Update java ci to only run spark 2.4 and 3.1
rzhang10 Oct 12, 2022
d46aaae
Minor fix HiveTableOperations
rzhang10 Oct 12, 2022
8c6c8e2
Adapt github CI to 0.14.x branch
rzhang10 Oct 12, 2022
ca81b95
Fix mr module checkstyle
rzhang10 Oct 12, 2022
d63a059
Fix checkstyle for orc module
rzhang10 Oct 12, 2022
555259a
Fix spark2.4 checkstyle
rzhang10 Oct 12, 2022
35c4650
Refactor catalog loading logic using CatalogUtil
rzhang10 Oct 21, 2022
f158233
Minor change to CI/release
rzhang10 Oct 21, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 53 additions & 25 deletions .github/workflows/java-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ name: "Java CI"
on:
push:
branches:
- 'master'
- '0.**'
tags:
- 'apache-iceberg-**'
- 'li-0.14.x'
tags-ignore: [v*] # release tags are autogenerated after a successful CI, no need to run CI against them
pull_request:
branches:
- 'li-0.14.x'
paths-ignore:
- '.github/workflows/python-ci.yml'
- '.github/workflows/spark-ci.yml'
Expand Down Expand Up @@ -53,28 +53,28 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
jvm: [8, 11]
jvm: [ 8, 11 ]
env:
SPARK_LOCAL_IP: localhost
steps:
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
with:
distribution: zulu
java-version: ${{ matrix.jvm }}
- uses: actions/cache@v3
with:
path: ~/.gradle/caches
key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle') }}
restore-keys: ${{ runner.os }}-gradle
- run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts
- run: ./gradlew check -DsparkVersions= -DhiveVersions= -DflinkVersions= -Pquick=true -x javadoc
- uses: actions/upload-artifact@v3
if: failure()
with:
name: test logs
path: |
**/build/testlogs
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
with:
distribution: zulu
java-version: ${{ matrix.jvm }}
- uses: actions/cache@v3
with:
path: ~/.gradle/caches
key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle') }}
restore-keys: ${{ runner.os }}-gradle
- run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts
- run: ./gradlew check -DsparkVersions= -DhiveVersions= -DflinkVersions= -Pquick=true -x javadoc
- uses: actions/upload-artifact@v3
if: failure()
with:
name: test logs
path: |
**/build/testlogs

build-checks:
runs-on: ubuntu-20.04
Expand All @@ -84,7 +84,7 @@ jobs:
with:
distribution: zulu
java-version: 8
- run: ./gradlew -DflinkVersions=1.13,1.14,1.15 -DsparkVersions=2.4,3.0,3.1,3.2,3.3 -DhiveVersions=2,3 build -x test -x javadoc -x integrationTest
- run: ./gradlew -DflinkVersions=1.13,1.14,1.15 -DsparkVersions=2.4,3.1 -DhiveVersions=2,3 build -x test -x javadoc -x integrationTest

build-javadoc:
runs-on: ubuntu-20.04
Expand All @@ -94,4 +94,32 @@ jobs:
with:
distribution: zulu
java-version: 8
- run: ./gradlew -Pquick=true javadoc
- run: ./gradlew -Pquick=true javadoc

release:
if: ${{ github.event_name == 'push' }}
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
with:
fetch-depth: '0' # https://github.com/shipkit/shipkit-changelog#fetch-depth-on-ci
- uses: actions/setup-java@v3
with:
distribution: zulu
java-version: 8
- run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts
- run: ./gradlew -DflinkVersions= -DsparkVersions=2.4,3.1 -DhiveVersions= -Pquick=true build -x javadoc
- name: Perform release
# Release job, only for pushes to the main development branch
if: ${{ github.event_name == 'push'
&& github.ref == 'refs/heads/li-0.14.x'
&& github.repository == 'linkedin/iceberg'
&& !contains(toJSON(github.event.commits.*.message), '[skip release]') }}

run: ./gradlew -DflinkVersions= -DsparkVersions=2.4,3.1 -DhiveVersions= githubRelease publishToSonatype closeAndReleaseStagingRepository
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
SONATYPE_USER: ${{secrets.SONATYPE_USER}}
SONATYPE_PWD: ${{secrets.SONATYPE_PWD}}
PGP_KEY: ${{secrets.PGP_KEY}}
PGP_PWD: ${{secrets.PGP_PWD}}
6 changes: 3 additions & 3 deletions .github/workflows/spark-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ name: "Spark CI"
on:
push:
branches:
- 'master'
- '0.**'
- 'li-0.14.x'
tags:
- 'apache-iceberg-**'
pull_request:
Expand Down Expand Up @@ -83,7 +82,7 @@ jobs:
strategy:
matrix:
jvm: [8, 11]
spark: ['3.0', '3.1', '3.2', '3.3']
spark: ['3.1']
env:
SPARK_LOCAL_IP: localhost
steps:
Expand All @@ -107,6 +106,7 @@ jobs:
**/build/testlogs

spark-3x-scala-2-13-tests:
if: ${{ false }}
runs-on: ubuntu-20.04
strategy:
matrix:
Expand Down
8 changes: 4 additions & 4 deletions api/src/main/java/org/apache/iceberg/types/PruneColumns.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,11 @@ public Type struct(Types.StructType struct, List<Type> fieldResults) {
} else if (projectedType != null) {
sameTypes = false; // signal that some types were altered
if (field.isOptional()) {
selectedFields.add(
Types.NestedField.optional(field.fieldId(), field.name(), projectedType, field.doc()));
selectedFields.add(Types.NestedField.optional(
field.fieldId(), field.name(), projectedType, field.getDefaultValue(), field.doc()));
} else {
selectedFields.add(
Types.NestedField.required(field.fieldId(), field.name(), projectedType, field.doc()));
selectedFields.add(Types.NestedField.required(
field.fieldId(), field.name(), projectedType, field.getDefaultValue(), field.doc()));
}
}
}
Expand Down
102 changes: 91 additions & 11 deletions api/src/main/java/org/apache/iceberg/types/Types.java
Original file line number Diff line number Diff line change
Expand Up @@ -415,42 +415,110 @@ public int hashCode() {

public static class NestedField implements Serializable {
public static NestedField optional(int id, String name, Type type) {
return new NestedField(true, id, name, type, null);
return new NestedField(true, id, name, type, null, null);
}

public static NestedField optional(int id, String name, Type type, String doc) {
return new NestedField(true, id, name, type, doc);
return new NestedField(true, id, name, type, null, doc);
}

public static NestedField optional(int id, String name, Type type, Object defaultValue, String doc) {
return new NestedField(true, id, name, type, defaultValue, doc);
}

public static NestedField required(int id, String name, Type type) {
return new NestedField(false, id, name, type, null);
return new NestedField(false, id, name, type, null, null);
}

public static NestedField required(int id, String name, Type type, String doc) {
return new NestedField(false, id, name, type, doc);
return new NestedField(false, id, name, type, null, doc);
}

public static NestedField required(int id, String name, Type type, Object defaultValue, String doc) {
return new NestedField(false, id, name, type, defaultValue, doc);
}

public static NestedField of(int id, boolean isOptional, String name, Type type) {
return new NestedField(isOptional, id, name, type, null);
return new NestedField(isOptional, id, name, type, null, null);
}

public static NestedField of(int id, boolean isOptional, String name, Type type, String doc) {
return new NestedField(isOptional, id, name, type, doc);
return new NestedField(isOptional, id, name, type, null, doc);
}

public static NestedField of(int id, boolean isOptional, String name, Type type, Object defaultValue, String doc) {
return new NestedField(isOptional, id, name, type, defaultValue, doc);
}

/**
 * Recursively validates that {@code defaultValue} is compatible with {@code type}.
 *
 * <p>Expected representations: STRUCT defaults are a {@code Map<String, Object>} keyed by field
 * name; LIST defaults are a {@code List<Object>}; MAP defaults are a {@code Map<Object, Object>};
 * FIXED/BINARY defaults are {@code byte[]}; all other types must match the type's Java class.
 * A {@code null} default means "no default" and is always accepted.
 *
 * @param defaultValue candidate default value, may be null
 * @param type the Iceberg type the default must conform to
 * @throws IllegalArgumentException if the default's Java representation does not match the type
 */
@SuppressWarnings("unchecked")
private static void validateDefaultValue(Object defaultValue, Type type) {
  if (defaultValue == null) {
    // null means "no default value set"; nothing to validate
    return;
  }
  switch (type.typeId()) {
    case STRUCT:
      Preconditions.checkArgument(defaultValue instanceof Map,
          "defaultValue should be a Map from field names to values, for StructType");
      Map<String, Object> defaultStruct = (Map<String, Object>) defaultValue;
      if (defaultStruct.isEmpty()) {
        return;
      }
      for (NestedField field : type.asStructType().fields()) {
        // when the struct default omits a field, fall back to that field's own default
        validateDefaultValue(defaultStruct.getOrDefault(field.name(), field.getDefaultValue()), field.type());
      }
      break;

    case LIST:
      Preconditions.checkArgument(defaultValue instanceof List,
          "defaultValue should be a List of Objects, for ListType");
      List<Object> defaultList = (List<Object>) defaultValue;
      if (defaultList.isEmpty()) {
        return;
      }
      defaultList.forEach(dv -> NestedField.validateDefaultValue(dv, type.asListType().elementField.type));
      break;

    case MAP:
      Preconditions.checkArgument(defaultValue instanceof Map,
          "defaultValue should be an instance of Map for MapType");
      Map<Object, Object> defaultMap = (Map<Object, Object>) defaultValue;
      if (defaultMap.isEmpty()) {
        return;
      }
      for (Map.Entry<Object, Object> e : defaultMap.entrySet()) {
        NestedField.validateDefaultValue(e.getKey(), type.asMapType().keyField.type);
        NestedField.validateDefaultValue(e.getValue(), type.asMapType().valueField.type);
      }
      break;

    case FIXED:
    case BINARY:
      Preconditions.checkArgument(defaultValue instanceof byte[],
          "defaultValue should be an instance of byte[] for TypeId.%s, but defaultValue.class = %s",
          type.typeId().name(), defaultValue.getClass().getCanonicalName());
      break;

    default:
      // primitive types: the default must be an instance of the type's Java class
      Preconditions.checkArgument(type.typeId().javaClass().isInstance(defaultValue),
          "defaultValue should be an instance of %s for TypeId.%s, but defaultValue.class = %s",
          type.typeId().javaClass(), type.typeId().name(), defaultValue.getClass().getCanonicalName());
  }
}
private final boolean isOptional;
private final int id;
private final String name;
private final Type type;
private final Object defaultValue;
private final String doc;

private NestedField(boolean isOptional, int id, String name, Type type, String doc) {
private NestedField(boolean isOptional, int id, String name, Type type, Object defaultValue, String doc) {
Preconditions.checkNotNull(name, "Name cannot be null");
Preconditions.checkNotNull(type, "Type cannot be null");
validateDefaultValue(defaultValue, type);
this.isOptional = isOptional;
this.id = id;
this.name = name;
this.type = type;
this.defaultValue = defaultValue;
this.doc = doc;
}

Expand All @@ -462,7 +530,7 @@ public NestedField asOptional() {
if (isOptional) {
return this;
}
return new NestedField(true, id, name, type, doc);
return new NestedField(true, id, name, type, defaultValue, doc);
}

public boolean isRequired() {
Expand All @@ -473,7 +541,15 @@ public NestedField asRequired() {
if (!isOptional) {
return this;
}
return new NestedField(false, id, name, type, doc);
return new NestedField(false, id, name, type, defaultValue, doc);
}

/** Returns {@code true} if this field has a default value, {@code false} otherwise. */
public boolean hasDefaultValue() {
  return null != this.defaultValue;
}

/** Returns this field's default value, or {@code null} when no default was set. */
public Object getDefaultValue() {
  return this.defaultValue;
}

public int fieldId() {
Expand All @@ -496,6 +572,7 @@ public String doc() {
public String toString() {
return String.format("%d: %s: %s %s",
id, name, isOptional ? "optional" : "required", type) +
(hasDefaultValue() ? ", default value: " + defaultValue + ", " : "") +
(doc != null ? " (" + doc + ")" : "");
}

Expand All @@ -514,6 +591,9 @@ public boolean equals(Object o) {
return false;
} else if (!name.equals(that.name)) {
return false;
} else if (!Objects.equals(defaultValue, that.defaultValue) &&
!Arrays.equals((byte[]) defaultValue, (byte[]) that.defaultValue)) {
return false;
} else if (!Objects.equals(doc, that.doc)) {
return false;
}
Expand All @@ -522,7 +602,8 @@ public boolean equals(Object o) {

@Override
public int hashCode() {
return Objects.hash(NestedField.class, id, isOptional, name, type);
return hasDefaultValue() ? Objects.hash(NestedField.class, id, isOptional, name, type, defaultValue) :
Objects.hash(NestedField.class, id, isOptional, name, type);
}
}

Expand Down Expand Up @@ -740,7 +821,6 @@ public boolean equals(Object o) {
} else if (!(o instanceof ListType)) {
return false;
}

ListType listType = (ListType) o;
return elementField.equals(listType.elementField);
}
Expand Down
Loading