diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java index 76e5e16fd5..220fa1338e 100644 --- a/java/core/src/java/org/apache/orc/OrcUtils.java +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -18,12 +18,15 @@ package org.apache.orc; import org.apache.orc.impl.ReaderImpl; +import org.apache.orc.impl.SchemaEvolution; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import static org.apache.hadoop.util.StringUtils.COMMA_STR; + public class OrcUtils { /** @@ -51,14 +54,16 @@ public static boolean[] includeColumns(String selectedColumns, Arrays.fill(results, true); return results; } + TypeDescription baseSchema = SchemaEvolution.checkAcidSchema(schema) ? + SchemaEvolution.getBaseRow(schema) : schema; + if (selectedColumns != null && - schema.getCategory() == TypeDescription.Category.STRUCT) { - List fieldNames = schema.getFieldNames(); - List fields = schema.getChildren(); - for (String column: selectedColumns.split((","))) { - TypeDescription col = findColumn(column, fieldNames, fields); - if (col != null) { - for(int i=col.getId(); i <= col.getMaximumId(); ++i) { + baseSchema.getCategory() == TypeDescription.Category.STRUCT) { + + for (String columnName : selectedColumns.split(COMMA_STR)) { + TypeDescription column = findColumn(baseSchema, columnName.trim()); + if (column != null) { + for (int i = column.getId(); i <= column.getMaximumId(); ++i) { results[i] = true; } } @@ -67,18 +72,33 @@ public static boolean[] includeColumns(String selectedColumns, return results; } - private static TypeDescription findColumn(String columnName, - List fieldNames, - List fields) { - int i = 0; - for(String fieldName: fieldNames) { - if (fieldName.equalsIgnoreCase(columnName)) { - return fields.get(i); - } else { - i += 1; + private static TypeDescription findColumn(TypeDescription schema, String column) { + TypeDescription result = schema; + String[] columnMatcher = column.split("\\."); + + int index = 0; + while (index < columnMatcher.length && + result.getCategory() == TypeDescription.Category.STRUCT) { + + String columnName = columnMatcher[index]; + int prevIndex = index; + + List fields = result.getChildren(); + List fieldNames = result.getFieldNames(); + + for (int i = 0; i < fields.size(); i++) { + if (columnName.equalsIgnoreCase(fieldNames.get(i))) { + result = fields.get(i); + index++; + + break; + } + } + if (prevIndex == index) { + return null; } } - return null; + return result; } public static List getOrcTypes(TypeDescription typeDescr) { diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index aa7ba9cb36..1d4cc67989 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -581,7 +581,7 @@ void buildIdentityConversion(TypeDescription readerType) { } } - private static boolean checkAcidSchema(TypeDescription type) { + public static boolean checkAcidSchema(TypeDescription type) { if (type.getCategory().equals(TypeDescription.Category.STRUCT)) { List rootFields = type.getFieldNames(); if (rootFields.size() != acidEventFieldNames.size()) { @@ -617,7 +617,7 @@ public static TypeDescription createEventSchema(TypeDescription typeDescr) { * @param typeDescription the ACID event schema. * @return the subtype for the real row */ - static TypeDescription getBaseRow(TypeDescription typeDescription) { + public static TypeDescription getBaseRow(TypeDescription typeDescription) { final int ACID_ROW_OFFSET = 5; return typeDescription.getChildren().get(ACID_ROW_OFFSET); } diff --git a/java/core/src/test/org/apache/orc/util/TestOrcUtils.java b/java/core/src/test/org/apache/orc/util/TestOrcUtils.java new file mode 100644 index 0000000000..26c5f0875e --- /dev/null +++ b/java/core/src/test/org/apache/orc/util/TestOrcUtils.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.util; + +import java.util.Arrays; + +import org.apache.orc.OrcUtils; +import org.apache.orc.TypeDescription; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Tests for OrcUtils. + */ +public class TestOrcUtils { + + @Test + public void testBloomFilterIncludeColumns() { + TypeDescription schema = TypeDescription.createStruct() + .addField("msisdn", TypeDescription.createString()) + .addField("imsi", TypeDescription.createVarchar()) + .addField("imei", TypeDescription.createInt()); + + boolean[] includeColumns = new boolean[3+1]; + includeColumns[1] = true; + includeColumns[3] = true; + + Assert.assertTrue(Arrays.equals(includeColumns, + OrcUtils.includeColumns("msisdn, imei", schema))); + } + + @Test + public void testBloomFilterIncludeColumns_ACID() { + TypeDescription rowSchema = TypeDescription.createStruct() + .addField("msisdn", TypeDescription.createString()) + .addField("imei", TypeDescription.createInt()); + + TypeDescription schema = TypeDescription.createStruct() + .addField("operation", TypeDescription.createString()) + .addField("originalTransaction", TypeDescription.createInt()) + .addField("bucket", TypeDescription.createInt()) + .addField("rowId", TypeDescription.createInt()) + .addField("currentTransaction", TypeDescription.createInt()) + .addField("row", rowSchema); + + boolean[] includeColumns = new boolean[8+1]; + includeColumns[7] = true; + + Assert.assertTrue(Arrays.equals(includeColumns, + OrcUtils.includeColumns("msisdn", schema))); + } + + @Test + public void testBloomFilterIncludeColumns_Nested() { + TypeDescription rowSchema = TypeDescription.createStruct() + .addField("msisdn", TypeDescription.createString()) + .addField("imei", TypeDescription.createInt()); + + TypeDescription schema = TypeDescription.createStruct() + .addField("row", rowSchema); + + boolean[] includeColumns = new boolean[3+1]; + includeColumns[2] = true; + + Assert.assertTrue(Arrays.equals(includeColumns, + OrcUtils.includeColumns("row.msisdn", schema))); + } + + @Test + public void testBloomFilterIncludeColumns_NonExisting() { + TypeDescription rowSchema = TypeDescription.createStruct() + .addField("msisdn", TypeDescription.createString()) + .addField("imei", TypeDescription.createInt()); + + TypeDescription schema = TypeDescription.createStruct() + .addField("row", rowSchema); + + boolean[] includeColumns = new boolean[3+1]; + + Assert.assertTrue(Arrays.equals(includeColumns, + OrcUtils.includeColumns("msisdn, row.msisdn2", schema))); + } +}