Skip to content

Commit e2db246

Browse files
committed
ORC-477: BloomFilter for ACID table does not get created
1 parent 4e7d9c2 commit e2db246

File tree

3 files changed

+139
-19
lines changed

3 files changed

+139
-19
lines changed

java/core/src/java/org/apache/orc/OrcUtils.java

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,15 @@
1818
package org.apache.orc;
1919

2020
import org.apache.orc.impl.ReaderImpl;
21+
import org.apache.orc.impl.SchemaEvolution;
2122

2223
import java.io.IOException;
2324
import java.util.ArrayList;
2425
import java.util.Arrays;
2526
import java.util.List;
2627

28+
import static org.apache.hadoop.util.StringUtils.COMMA_STR;
29+
2730
public class OrcUtils {
2831

2932
/**
@@ -51,14 +54,16 @@ public static boolean[] includeColumns(String selectedColumns,
5154
Arrays.fill(results, true);
5255
return results;
5356
}
57+
TypeDescription baseSchema = SchemaEvolution.checkAcidSchema(schema) ?
58+
SchemaEvolution.getBaseRow(schema) : schema;
59+
5460
if (selectedColumns != null &&
55-
schema.getCategory() == TypeDescription.Category.STRUCT) {
56-
List<String> fieldNames = schema.getFieldNames();
57-
List<TypeDescription> fields = schema.getChildren();
58-
for (String column: selectedColumns.split((","))) {
59-
TypeDescription col = findColumn(column, fieldNames, fields);
60-
if (col != null) {
61-
for(int i=col.getId(); i <= col.getMaximumId(); ++i) {
61+
baseSchema.getCategory() == TypeDescription.Category.STRUCT) {
62+
63+
for (String columnName : selectedColumns.split(COMMA_STR)) {
64+
TypeDescription column = findColumn(baseSchema, columnName.trim());
65+
if (column != null) {
66+
for (int i = column.getId(); i <= column.getMaximumId(); ++i) {
6267
results[i] = true;
6368
}
6469
}
@@ -67,18 +72,33 @@ public static boolean[] includeColumns(String selectedColumns,
6772
return results;
6873
}
6974

70-
private static TypeDescription findColumn(String columnName,
71-
List<String> fieldNames,
72-
List<TypeDescription> fields) {
73-
int i = 0;
74-
for(String fieldName: fieldNames) {
75-
if (fieldName.equalsIgnoreCase(columnName)) {
76-
return fields.get(i);
77-
} else {
78-
i += 1;
75+
private static TypeDescription findColumn(TypeDescription schema, String column) {
76+
TypeDescription result = schema;
77+
String[] columnMatcher = column.split("\\.");
78+
79+
int index = 0;
80+
while (index < columnMatcher.length &&
81+
result.getCategory() == TypeDescription.Category.STRUCT) {
82+
83+
String columnName = columnMatcher[index];
84+
int prevIndex = index;
85+
86+
List<TypeDescription> fields = result.getChildren();
87+
List<String> fieldNames = result.getFieldNames();
88+
89+
for (int i = 0; i < fields.size(); i++) {
90+
if (columnName.equalsIgnoreCase(fieldNames.get(i))) {
91+
result = fields.get(i);
92+
index++;
93+
94+
break;
95+
}
96+
}
97+
if (prevIndex == index) {
98+
return null;
7999
}
80100
}
81-
return null;
101+
return result;
82102
}
83103

84104
public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) {

java/core/src/java/org/apache/orc/impl/SchemaEvolution.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,7 @@ void buildIdentityConversion(TypeDescription readerType) {
581581
}
582582
}
583583

584-
private static boolean checkAcidSchema(TypeDescription type) {
584+
public static boolean checkAcidSchema(TypeDescription type) {
585585
if (type.getCategory().equals(TypeDescription.Category.STRUCT)) {
586586
List<String> rootFields = type.getFieldNames();
587587
if (rootFields.size() != acidEventFieldNames.size()) {
@@ -617,7 +617,7 @@ public static TypeDescription createEventSchema(TypeDescription typeDescr) {
617617
* @param typeDescription the ACID event schema.
618618
* @return the subtype for the real row
619619
*/
620-
static TypeDescription getBaseRow(TypeDescription typeDescription) {
620+
public static TypeDescription getBaseRow(TypeDescription typeDescription) {
621621
final int ACID_ROW_OFFSET = 5;
622622
return typeDescription.getChildren().get(ACID_ROW_OFFSET);
623623
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.orc.util;
20+
21+
import java.util.Arrays;
22+
23+
import org.apache.orc.OrcUtils;
24+
import org.apache.orc.TypeDescription;
25+
26+
import org.junit.Assert;
27+
import org.junit.Test;
28+
29+
/**
30+
* Tests for OrcUtils.
31+
*/
32+
public class TestOrcUtils {
33+
34+
@Test
35+
public void testBloomFilterIncludeColumns() {
36+
TypeDescription schema = TypeDescription.createStruct()
37+
.addField("msisdn", TypeDescription.createString())
38+
.addField("imsi", TypeDescription.createVarchar())
39+
.addField("imei", TypeDescription.createInt());
40+
41+
boolean[] includeColumns = new boolean[3+1];
42+
includeColumns[1] = true;
43+
includeColumns[3] = true;
44+
45+
Assert.assertTrue(Arrays.equals(includeColumns,
46+
OrcUtils.includeColumns("msisdn, imei", schema)));
47+
}
48+
49+
@Test
50+
public void testBloomFilterIncludeColumns_ACID() {
51+
TypeDescription rowSchema = TypeDescription.createStruct()
52+
.addField("msisdn", TypeDescription.createString())
53+
.addField("imei", TypeDescription.createInt());
54+
55+
TypeDescription schema = TypeDescription.createStruct()
56+
.addField("operation", TypeDescription.createString())
57+
.addField("originalTransaction", TypeDescription.createInt())
58+
.addField("bucket", TypeDescription.createInt())
59+
.addField("rowId", TypeDescription.createInt())
60+
.addField("currentTransaction", TypeDescription.createInt())
61+
.addField("row", rowSchema);
62+
63+
boolean[] includeColumns = new boolean[8+1];
64+
includeColumns[7] = true;
65+
66+
Assert.assertTrue(Arrays.equals(includeColumns,
67+
OrcUtils.includeColumns("msisdn", schema)));
68+
}
69+
70+
@Test
71+
public void testBloomFilterIncludeColumns_Nested() {
72+
TypeDescription rowSchema = TypeDescription.createStruct()
73+
.addField("msisdn", TypeDescription.createString())
74+
.addField("imei", TypeDescription.createInt());
75+
76+
TypeDescription schema = TypeDescription.createStruct()
77+
.addField("row", rowSchema);
78+
79+
boolean[] includeColumns = new boolean[3+1];
80+
includeColumns[2] = true;
81+
82+
Assert.assertTrue(Arrays.equals(includeColumns,
83+
OrcUtils.includeColumns("row.msisdn", schema)));
84+
}
85+
86+
@Test
87+
public void testBloomFilterIncludeColumns_NonExisting() {
88+
TypeDescription rowSchema = TypeDescription.createStruct()
89+
.addField("msisdn", TypeDescription.createString())
90+
.addField("imei", TypeDescription.createInt());
91+
92+
TypeDescription schema = TypeDescription.createStruct()
93+
.addField("row", rowSchema);
94+
95+
boolean[] includeColumns = new boolean[3+1];
96+
97+
Assert.assertTrue(Arrays.equals(includeColumns,
98+
OrcUtils.includeColumns("msisdn, row.msisdn2", schema)));
99+
}
100+
}

0 commit comments

Comments
 (0)