diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 477b1511f84..d0bb268f35c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -199,7 +199,10 @@ public ValueVector encode(ValueVector vector) { } /** - * Decodes a vector with the built hash table in this encoder. + * Decodes a vector with the dictionary in this encoder. + * + * {@link DictionaryEncoder#decode(ValueVector, Dictionary, BufferAllocator)} should be used instead if only decoding + * is required as it can avoid building the {@link DictionaryHashTable} which only makes sense when encoding. */ public ValueVector decode(ValueVector indices) { return decode(indices, dictionary, allocator); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java index dd2bb26e3af..00d7c8af179 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ListSubfieldEncoder.java @@ -54,11 +54,11 @@ public ListSubfieldEncoder(Dictionary dictionary, BufferAllocator allocator, Arr hashTable = new DictionaryHashTable(getDataVector(dictVector), hasher); } - private FieldVector getDataVector(BaseListVector vector) { + private static FieldVector getDataVector(BaseListVector vector) { return vector.getChildrenFromFields().get(0); } - private BaseListVector cloneVector(BaseListVector vector) { + private static BaseListVector cloneVector(BaseListVector vector, BufferAllocator allocator) { final FieldType fieldType = vector.getField().getFieldType(); BaseListVector cloned = (BaseListVector) 
fieldType.createNewSingleVector(vector.getField().getName(), @@ -84,7 +84,7 @@ public BaseListVector encodeListSubField(BaseListVector vector) { Field valueField = new Field(vector.getField().getName(), indexFieldType, null); // clone list vector and initialize data vector - BaseListVector encoded = cloneVector(vector); + BaseListVector encoded = cloneVector(vector, allocator); encoded.initializeChildrenFromFields(Collections.singletonList(valueField)); BaseIntVector indices = (BaseIntVector) getDataVector(encoded); @@ -103,17 +103,35 @@ public BaseListVector encodeListSubField(BaseListVector vector) { /** * Decodes a dictionary subfields encoded vector using the provided dictionary. + * + * {@link ListSubfieldEncoder#decodeListSubField(BaseListVector, Dictionary, BufferAllocator)} should be used instead + * if only decoding is required as it can avoid building the {@link DictionaryHashTable} which only makes sense when + * encoding. + * * @param vector dictionary encoded vector, its data vector must be int type * @return vector with values restored from dictionary */ public BaseListVector decodeListSubField(BaseListVector vector) { + return decodeListSubField(vector, dictionary, allocator); + } + /** + * Decodes a dictionary subfields encoded vector using the provided dictionary. 
+ * + * @param vector dictionary encoded vector, its data vector must be int type + * @param dictionary dictionary used to decode the values + * @param allocator allocator the decoded values use + * @return vector with values restored from dictionary + */ + public static BaseListVector decodeListSubField(BaseListVector vector, + Dictionary dictionary, + BufferAllocator allocator) { int valueCount = vector.getValueCount(); BaseListVector dictionaryVector = (BaseListVector) dictionary.getVector(); int dictionaryValueCount = getDataVector(dictionaryVector).getValueCount(); // clone list vector and initialize data vector - BaseListVector decoded = cloneVector(vector); + BaseListVector decoded = cloneVector(vector, allocator); Field dataVectorField = getDataVector(dictionaryVector).getField(); decoded.initializeChildrenFromFields(Collections.singletonList(dataVectorField)); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/StructSubfieldEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/StructSubfieldEncoder.java index 6542b298d7d..d19e261490d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/StructSubfieldEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/StructSubfieldEncoder.java @@ -70,11 +70,11 @@ public StructSubfieldEncoder( dictionaryIdToHashTable.put(id, new DictionaryHashTable(provider.lookup(id).getVector(), hasher))); } - private FieldVector getChildVector(StructVector vector, int index) { + private static FieldVector getChildVector(StructVector vector, int index) { return vector.getChildrenFromFields().get(index); } - private StructVector cloneVector(StructVector vector) { + private static StructVector cloneVector(StructVector vector, BufferAllocator allocator) { final FieldType fieldType = vector.getField().getFieldType(); StructVector cloned = (StructVector) fieldType.createNewSingleVector( @@ -117,7 +117,7 @@ public StructVector encode(StructVector 
vector, Map columnToDicti } // clone list vector and initialize data vector - StructVector encoded = cloneVector(vector); + StructVector encoded = cloneVector(vector, allocator); encoded.initializeChildrenFromFields(childrenFields); encoded.setValueCount(valueCount); @@ -139,20 +139,38 @@ public StructVector encode(StructVector vector, Map columnToDicti /** * Decodes a dictionary subfields encoded vector using the provided dictionary. + * + * {@link StructSubfieldEncoder#decode(StructVector, DictionaryProvider.MapDictionaryProvider, BufferAllocator)} + * should be used instead if only decoding is required as it can avoid building the {@link DictionaryHashTable} + * which only makes sense when encoding. + * * @param vector dictionary encoded vector, its child vector must be int type * @return vector with values restored from dictionary */ public StructVector decode(StructVector vector) { + return decode(vector, provider, allocator); + } + /** + * Decodes a dictionary subfields encoded vector using the provided dictionary. 
+ * + * @param vector dictionary encoded vector, its child vector must be int type + * @param provider dictionary provider used to decode the values + * @param allocator allocator the decoded values use + * @return vector with values restored from dictionary + */ + public static StructVector decode(StructVector vector, + DictionaryProvider.MapDictionaryProvider provider, + BufferAllocator allocator) { final int valueCount = vector.getValueCount(); final int childCount = vector.getChildrenFromFields().size(); // clone list vector and initialize child vectors - StructVector decoded = cloneVector(vector); + StructVector decoded = cloneVector(vector, allocator); List childFields = new ArrayList<>(); for (int i = 0; i < childCount; i++) { FieldVector childVector = getChildVector(vector, i); - Dictionary dictionary = getChildVectorDictionary(childVector); + Dictionary dictionary = getChildVectorDictionary(childVector, provider); // childVector is not encoded. if (dictionary == null) { childFields.add(childVector.getField()); @@ -167,7 +185,7 @@ public StructVector decode(StructVector vector) { // get child vector FieldVector childVector = getChildVector(vector, index); FieldVector decodedChildVector = getChildVector(decoded, index); - Dictionary dictionary = getChildVectorDictionary(childVector); + Dictionary dictionary = getChildVectorDictionary(childVector, provider); if (dictionary == null) { childVector.makeTransferPair(decodedChildVector).splitAndTransfer(0, valueCount); } else { @@ -184,7 +202,8 @@ public StructVector decode(StructVector vector) { /** * Get the child vector dictionary, return null if not dictionary encoded. 
*/ - private Dictionary getChildVectorDictionary(FieldVector childVector) { + private static Dictionary getChildVectorDictionary(FieldVector childVector, + DictionaryProvider.MapDictionaryProvider provider) { DictionaryEncoding dictionaryEncoding = childVector.getField().getDictionary(); if (dictionaryEncoding != null) { Dictionary dictionary = provider.lookup(dictionaryEncoding.getId());