diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml index 3c1acbdfe771..086004f121e9 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml @@ -184,7 +184,7 @@ services: presto-coordinator-1: container_name: presto-coordinator-1 hostname: presto-coordinator-1 - image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.268:latest + image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest ports: - '8090:8090' environment: @@ -201,25 +201,25 @@ services: command: coordinator presto-worker-1: - container_name: presto-worker-1 - hostname: presto-worker-1 - image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.268:latest - depends_on: ["presto-coordinator-1"] - environment: - - PRESTO_JVM_MAX_HEAP=512M - - PRESTO_QUERY_MAX_MEMORY=1GB - - PRESTO_QUERY_MAX_MEMORY_PER_NODE=256MB - - PRESTO_QUERY_MAX_TOTAL_MEMORY_PER_NODE=384MB - - PRESTO_MEMORY_HEAP_HEADROOM_PER_NODE=100MB - - TERM=xterm - links: - - "hivemetastore" - - "hiveserver" - - "hive-metastore-postgresql" - - "namenode" - volumes: - - ${HUDI_WS}:/var/hoodie/ws - command: worker + container_name: presto-worker-1 + hostname: presto-worker-1 + image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest + depends_on: [ "presto-coordinator-1" ] + environment: + - PRESTO_JVM_MAX_HEAP=512M + - PRESTO_QUERY_MAX_MEMORY=1GB + - PRESTO_QUERY_MAX_MEMORY_PER_NODE=256MB + - PRESTO_QUERY_MAX_TOTAL_MEMORY_PER_NODE=384MB + - PRESTO_MEMORY_HEAP_HEADROOM_PER_NODE=100MB + - TERM=xterm + links: + - "hivemetastore" + - "hiveserver" + - "hive-metastore-postgresql" + - "namenode" + volumes: + - ${HUDI_WS}:/var/hoodie/ws + command: worker trino-coordinator-1: container_name: trino-coordinator-1 diff --git a/docker/demo/trino-batch1.commands b/docker/demo/trino-batch1.commands new file mode 100644 index 000000000000..d89c19b0bf0b --- /dev/null +++ b/docker/demo/trino-batch1.commands @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'; +select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; diff --git a/docker/demo/trino-batch2-after-compaction.commands b/docker/demo/trino-batch2-after-compaction.commands new file mode 100644 index 000000000000..da42b4728252 --- /dev/null +++ b/docker/demo/trino-batch2-after-compaction.commands @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'; +select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'; diff --git a/docker/demo/trino-table-check.commands b/docker/demo/trino-table-check.commands new file mode 100644 index 000000000000..4362d79fe770 --- /dev/null +++ b/docker/demo/trino-table-check.commands @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +show tables; diff --git a/docker/hoodie/hadoop/pom.xml b/docker/hoodie/hadoop/pom.xml index de3bd3d57832..b029abafa83e 100644 --- a/docker/hoodie/hadoop/pom.xml +++ b/docker/hoodie/hadoop/pom.xml @@ -57,7 +57,7 @@ 2.4.4 2.3.3 2.8.4 - 0.268 + 0.271 368 1.4.13 true diff --git a/docker/hoodie/hadoop/prestobase/Dockerfile b/docker/hoodie/hadoop/prestobase/Dockerfile index 12b644aa0631..9d0ce90bad6c 100644 --- a/docker/hoodie/hadoop/prestobase/Dockerfile +++ b/docker/hoodie/hadoop/prestobase/Dockerfile @@ -22,7 +22,7 @@ ARG HADOOP_VERSION=2.8.4 ARG HIVE_VERSION=2.3.3 FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-base:latest as hadoop-base -ARG PRESTO_VERSION=0.268 +ARG PRESTO_VERSION=0.271 ENV PRESTO_VERSION ${PRESTO_VERSION} ENV PRESTO_HOME /opt/presto-server-${PRESTO_VERSION} @@ -79,6 +79,13 @@ RUN chmod +x /usr/local/bin/entrypoint.sh ADD target/ /var/hoodie/ws/docker/hoodie/hadoop/prestobase/target/ ENV HUDI_PRESTO_BUNDLE /var/hoodie/ws/docker/hoodie/hadoop/prestobase/target/hudi-presto-bundle.jar RUN cp ${HUDI_PRESTO_BUNDLE} ${PRESTO_HOME}/plugin/hive-hadoop2/ +# TODO: the latest master of Presto relies on hudi-presto-bundle instead of the hudi-common +# and hudi-hadoop-mr. To get around the conflicts due to older Hudi jars below, they are +# removed for integration tests, so the hudi-presto-bundle build can be used solely for testing. +# This temporary logic must be removed once Presto has a new release depending on +# hudi-presto-bundle and we upgrade docker setup to that release version. 
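The SQL in the new trino-batch1.commands / trino-batch2-after-compaction.commands files above can also be issued ad hoc once the coordinator is up. The sketch below is illustrative only and not part of this patch: it assumes the Presto JDBC driver is on the classpath, the coordinator is reachable through the 8090 port mapping from the compose file, and the demo tables are registered under the hive catalog's default schema.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Properties;

public class StockTicksQueryCheck {
  public static void main(String[] args) throws Exception {
    // Assumptions: presto-jdbc on the classpath; coordinator reachable on
    // localhost:8090 (the port published by presto-coordinator-1 in the compose
    // file); demo tables live in the hive catalog, default schema.
    Properties props = new Properties();
    props.setProperty("user", "demo"); // Presto JDBC requires a user name

    String url = "jdbc:presto://localhost:8090/hive/default";
    // Same query as the first statement in trino-batch1.commands (no trailing ';' over JDBC)
    String sql = "select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'";

    try (Connection conn = DriverManager.getConnection(url, props);
         Statement stmt = conn.createStatement();
         ResultSet rs = stmt.executeQuery(sql)) {
      while (rs.next()) {
        System.out.println(rs.getString(1) + " -> " + rs.getString(2));
      }
    }
  }
}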
+RUN rm ${PRESTO_HOME}/plugin/hive-hadoop2/hudi-common-* +RUN rm ${PRESTO_HOME}/plugin/hive-hadoop2/hudi-hadoop-mr-* VOLUME ["${PRESTO_LOG_DIR}"] diff --git a/hudi-aws/pom.xml b/hudi-aws/pom.xml index d44a389a61f6..4abbd119a0d5 100644 --- a/hudi-aws/pom.xml +++ b/hudi-aws/pom.xml @@ -48,6 +48,25 @@ + + org.apache.hadoop + hadoop-common + provided + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.apache.hadoop hadoop-common diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java index 1079566b782f..5ce377901a4b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java @@ -21,14 +21,14 @@ import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.CellComparator; import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hbase.io.hfile.CacheConfig; public class HoodieHFileConfig { - public static final KeyValue.KVComparator HFILE_COMPARATOR = new HoodieHBaseKVComparator(); + public static final CellComparator HFILE_COMPARATOR = new HoodieHBaseKVComparator(); public static final boolean PREFETCH_ON_OPEN = CacheConfig.DEFAULT_PREFETCH_ON_OPEN; public static final boolean CACHE_DATA_IN_L1 = HColumnDescriptor.DEFAULT_CACHE_DATA_IN_L1; // This is private in CacheConfig so have been copied here. @@ -42,12 +42,12 @@ public class HoodieHFileConfig { private final boolean dropBehindCacheCompaction; private final Configuration hadoopConf; private final BloomFilter bloomFilter; - private final KeyValue.KVComparator hfileComparator; + private final CellComparator hfileComparator; private final String keyFieldName; public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compressionAlgorithm, int blockSize, long maxFileSize, String keyFieldName, boolean prefetchBlocksOnOpen, boolean cacheDataInL1, - boolean dropBehindCacheCompaction, BloomFilter bloomFilter, KeyValue.KVComparator hfileComparator) { + boolean dropBehindCacheCompaction, BloomFilter bloomFilter, CellComparator hfileComparator) { this.hadoopConf = hadoopConf; this.compressionAlgorithm = compressionAlgorithm; this.blockSize = blockSize; @@ -96,7 +96,7 @@ public BloomFilter getBloomFilter() { return bloomFilter; } - public KeyValue.KVComparator getHfileComparator() { + public CellComparator getHFileComparator() { return hfileComparator; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java index 2ad6d7f9220b..5dcd2e0a32e5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java @@ -25,6 +25,8 @@ import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.avro.Schema; import 
org.apache.avro.generic.GenericRecord; @@ -38,8 +40,6 @@ import org.apache.hadoop.hbase.io.hfile.HFileContext; import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; import org.apache.hadoop.io.Writable; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.StringUtils; import java.io.DataInput; import java.io.DataOutput; @@ -95,6 +95,7 @@ public HoodieHFileWriter(String instantTime, Path file, HoodieHFileConfig hfileC HFileContext context = new HFileContextBuilder().withBlockSize(hfileConfig.getBlockSize()) .withCompression(hfileConfig.getCompressionAlgorithm()) + .withCellComparator(hfileConfig.getHFileComparator()) .build(); conf.set(CacheConfig.PREFETCH_BLOCKS_ON_OPEN_KEY, String.valueOf(hfileConfig.shouldPrefetchBlocksOnOpen())); @@ -104,7 +105,6 @@ public HoodieHFileWriter(String instantTime, Path file, HoodieHFileConfig hfileC this.writer = HFile.getWriterFactory(conf, cacheConfig) .withPath(this.fs, this.file) .withFileContext(context) - .withComparator(hfileConfig.getHfileComparator()) .create(); writer.appendFileInfo(HoodieHFileReader.KEY_SCHEMA.getBytes(), schema.toString().getBytes()); diff --git a/hudi-client/hudi-java-client/pom.xml b/hudi-client/hudi-java-client/pom.xml index 3471bfb8ba36..b299150c6e3e 100644 --- a/hudi-client/hudi-java-client/pom.xml +++ b/hudi-client/hudi-java-client/pom.xml @@ -122,6 +122,26 @@ test + + org.apache.hadoop + hadoop-hdfs + test + + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.apache.hadoop hadoop-hdfs diff --git a/hudi-client/hudi-spark-client/pom.xml b/hudi-client/hudi-spark-client/pom.xml index d6c60cb61bc4..20cccda5ea42 100644 --- a/hudi-client/hudi-spark-client/pom.xml +++ b/hudi-client/hudi-spark-client/pom.xml @@ -90,6 +90,12 @@ + + org.apache.hadoop + hadoop-hdfs-client + ${hadoop.version} + test + org.apache.hbase hbase-testing-util @@ -110,6 +116,12 @@ + + org.apache.zookeeper + zookeeper + ${zookeeper.version} + test + diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java index 05d7f99446e9..8a3abfd6e1cb 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java @@ -66,6 +66,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.io.storage.HoodieHFileReader.KEY_SCHEMA; + /** * Utility methods to aid testing inside the HoodieClient module. 
*/ @@ -241,9 +243,9 @@ public static Stream readHFile(JavaSparkContext jsc, String[] pat Schema schema = null; for (String path : paths) { try { - HFile.Reader reader = HFile.createReader(fs, new Path(path), cacheConfig, fs.getConf()); + HFile.Reader reader = HFile.createReader(fs, new Path(path), cacheConfig, true, fs.getConf()); if (schema == null) { - schema = new Schema.Parser().parse(new String(reader.loadFileInfo().get("schema".getBytes()))); + schema = new Schema.Parser().parse(new String(reader.getHFileInfo().get(KEY_SCHEMA.getBytes()))); } HFileScanner scanner = reader.getScanner(false, false); if (!scanner.seekTo()) { @@ -252,7 +254,7 @@ public static Stream readHFile(JavaSparkContext jsc, String[] pat } do { - Cell c = scanner.getKeyValue(); + Cell c = scanner.getCell(); byte[] value = Arrays.copyOfRange(c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength()); valuesAsList.add(HoodieAvroUtils.bytesToAvro(value, schema)); } while (scanner.next()); diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml index 1a558aeae332..b5b2a3a47a4a 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -221,14 +221,13 @@ org.apache.hbase hbase-client ${hbase.version} - test org.apache.hbase hbase-server ${hbase.version} - + compile diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java index 3700d01a60ea..7f36a47a4d24 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bootstrap/index/HFileBootstrapIndex.java @@ -37,6 +37,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.CellComparatorImpl; import org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.KeyValue; @@ -178,9 +179,7 @@ private static String getUserKeyFromCellKey(String cellKey) { private static HFile.Reader createReader(String hFilePath, Configuration conf, FileSystem fileSystem) { try { LOG.info("Opening HFile for reading :" + hFilePath); - HFile.Reader reader = HFile.createReader(fileSystem, new HFilePathForReader(hFilePath), - new CacheConfig(conf), conf); - return reader; + return HFile.createReader(fileSystem, new HFilePathForReader(hFilePath), new CacheConfig(conf), true, conf); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } @@ -259,7 +258,7 @@ private void initIndexInfo() { private HoodieBootstrapIndexInfo fetchBootstrapIndexInfo() throws IOException { return TimelineMetadataUtils.deserializeAvroMetadata( - partitionIndexReader().loadFileInfo().get(INDEX_INFO_KEY), + partitionIndexReader().getHFileInfo().get(INDEX_INFO_KEY), HoodieBootstrapIndexInfo.class); } @@ -306,7 +305,7 @@ private List getAllKeys(HFileScanner scanner, Function convert try { boolean available = scanner.seekTo(); while (available) { - keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getKeyValue())))); + keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getCell())))); available = scanner.next(); } } catch (IOException ioe) { @@ -528,13 +527,13 @@ public void close() { @Override public void begin() { try { - HFileContext meta = new HFileContextBuilder().build(); + HFileContext meta = new 
HFileContextBuilder().withCellComparator(new HoodieKVComparator()).build(); this.indexByPartitionWriter = HFile.getWriterFactory(metaClient.getHadoopConf(), new CacheConfig(metaClient.getHadoopConf())).withPath(metaClient.getFs(), indexByPartitionPath) - .withFileContext(meta).withComparator(new HoodieKVComparator()).create(); + .withFileContext(meta).create(); this.indexByFileIdWriter = HFile.getWriterFactory(metaClient.getHadoopConf(), new CacheConfig(metaClient.getHadoopConf())).withPath(metaClient.getFs(), indexByFileIdPath) - .withFileContext(meta).withComparator(new HoodieKVComparator()).create(); + .withFileContext(meta).create(); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } @@ -581,6 +580,6 @@ public String getName() { * This class is explicitly used as Key Comparator to workaround hard coded * legacy format class names inside HBase. Otherwise we will face issues with shading. */ - public static class HoodieKVComparator extends KeyValue.KVComparator { + public static class HoodieKVComparator extends CellComparatorImpl { } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java index fa5117e41fa7..6a0b10fe07ea 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java @@ -424,6 +424,9 @@ private void processQueuedBlocksForInstant(Deque logBlocks, int processDataBlock((HoodieAvroDataBlock) lastBlock, keys); break; case HFILE_DATA_BLOCK: + if (!keys.isPresent()) { + keys = Option.of(Collections.emptyList()); + } processDataBlock((HoodieHFileDataBlock) lastBlock, keys); break; case PARQUET_DATA_BLOCK: diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java index 557a0db7cbfa..e843ad74cb31 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java @@ -18,6 +18,17 @@ package org.apache.hudi.common.table.log.block; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.fs.inline.InLineFSUtils; +import org.apache.hudi.common.fs.inline.InLineFileSystem; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.HoodieHBaseKVComparator; +import org.apache.hudi.io.storage.HoodieHFileReader; + import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; @@ -30,17 +41,6 @@ import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.io.hfile.HFileContext; import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.fs.inline.InLineFSUtils; -import org.apache.hudi.common.fs.inline.InLineFileSystem; -import org.apache.hudi.common.util.ClosableIterator; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.StringUtils; -import 
org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.io.storage.HoodieHBaseKVComparator; -import org.apache.hudi.io.storage.HoodieHFileReader; - import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -95,6 +95,7 @@ protected byte[] serializeRecords(List records) throws IOExceptio HFileContext context = new HFileContextBuilder() .withBlockSize(DEFAULT_BLOCK_SIZE) .withCompression(compressionAlgorithm.get()) + .withCellComparator(new HoodieHBaseKVComparator()) .build(); Configuration conf = new Configuration(); @@ -128,7 +129,7 @@ protected byte[] serializeRecords(List records) throws IOExceptio } HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig) - .withOutputStream(ostream).withFileContext(context).withComparator(new HoodieHBaseKVComparator()).create(); + .withOutputStream(ostream).withFileContext(context).create(); // Write the records sortedRecordsMap.forEach((recordKey, recordBytes) -> { diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java index 2d4d96959e15..aaf1dcd7037b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHBaseKVComparator.java @@ -19,11 +19,11 @@ package org.apache.hudi.io.storage; -import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.CellComparatorImpl; /** * This class is explicitly used as Key Comparator to work around the hard coded * legacy format class names inside HBase. Otherwise, we will face issues with shading. */ -public class HoodieHBaseKVComparator extends KeyValue.KVComparator { +public class HoodieHBaseKVComparator extends CellComparatorImpl { } diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java index 371da7675e99..5c861c9cc7a2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java @@ -18,18 +18,16 @@ package org.apache.hudi.io.storage; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; -import java.util.stream.Collectors; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.io.ByteBufferBackedInputStream; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; @@ -41,24 +39,31 @@ import org.apache.hadoop.fs.Seekable; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.fs.HFileSystem; import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper; import org.apache.hadoop.hbase.io.hfile.CacheConfig; import 
org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.io.hfile.HFileInfo; import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.io.hfile.ReaderContext; +import org.apache.hadoop.hbase.io.hfile.ReaderContextBuilder; +import org.apache.hadoop.hbase.nio.ByteBuff; import org.apache.hadoop.hbase.util.Pair; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.BloomFilterFactory; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.util.ClosableIterator; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.common.util.io.ByteBufferBackedInputStream; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + public class HoodieHFileReader implements HoodieFileReader { private static final Logger LOG = LogManager.getLogger(HoodieHFileReader.class); private Path path; @@ -80,14 +85,14 @@ public class HoodieHFileReader implements HoodieFileRea public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig) throws IOException { this.conf = configuration; this.path = path; - this.reader = HFile.createReader(FSUtils.getFs(path.toString(), configuration), path, cacheConfig, conf); + this.reader = HFile.createReader(FSUtils.getFs(path.toString(), configuration), path, cacheConfig, true, conf); } public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig, FileSystem fs) throws IOException { this.conf = configuration; this.path = path; this.fsDataInputStream = fs.open(path); - this.reader = HFile.createReader(fs, path, cacheConfig, configuration); + this.reader = HFile.createReader(fs, path, cacheConfig, true, configuration); } public HoodieHFileReader(byte[] content) throws IOException { @@ -95,30 +100,34 @@ public HoodieHFileReader(byte[] content) throws IOException { Path path = new Path("hoodie"); SeekableByteArrayInputStream bis = new SeekableByteArrayInputStream(content); FSDataInputStream fsdis = new FSDataInputStream(bis); - this.reader = HFile.createReader(FSUtils.getFs("hoodie", conf), path, new FSDataInputStreamWrapper(fsdis), - content.length, new CacheConfig(conf), conf); + FSDataInputStreamWrapper stream = new FSDataInputStreamWrapper(fsdis); + FileSystem fs = FSUtils.getFs("hoodie", conf); + HFileSystem hfs = (fs instanceof HFileSystem) ? 
(HFileSystem) fs : new HFileSystem(fs); + ReaderContext context = new ReaderContextBuilder() + .withFilePath(path) + .withInputStreamWrapper(stream) + .withFileSize(content.length) + .withFileSystem(hfs) + .withPrimaryReplicaReader(true) + .withReaderType(ReaderContext.ReaderType.STREAM) + .build(); + HFileInfo fileInfo = new HFileInfo(context, conf); + this.reader = HFile.createReader(context, fileInfo, new CacheConfig(conf), conf); + fileInfo.initMetaAndIndex(reader); } @Override public String[] readMinMaxRecordKeys() { - try { - Map fileInfo = reader.loadFileInfo(); - return new String[] { new String(fileInfo.get(KEY_MIN_RECORD.getBytes())), - new String(fileInfo.get(KEY_MAX_RECORD.getBytes()))}; - } catch (IOException e) { - throw new HoodieException("Could not read min/max record key out of file information block correctly from path", e); - } + HFileInfo fileInfo = reader.getHFileInfo(); + return new String[] { new String(fileInfo.get(KEY_MIN_RECORD.getBytes())), + new String(fileInfo.get(KEY_MAX_RECORD.getBytes()))}; } @Override public Schema getSchema() { if (schema == null) { - try { - Map fileInfo = reader.loadFileInfo(); - schema = new Schema.Parser().parse(new String(fileInfo.get(KEY_SCHEMA.getBytes()))); - } catch (IOException e) { - throw new HoodieException("Could not read schema of file from path", e); - } + HFileInfo fileInfo = reader.getHFileInfo(); + schema = new Schema.Parser().parse(new String(fileInfo.get(KEY_SCHEMA.getBytes()))); } return schema; @@ -133,10 +142,10 @@ public void withSchema(Schema schema) { @Override public BloomFilter readBloomFilter() { - Map fileInfo; + HFileInfo fileInfo; try { - fileInfo = reader.loadFileInfo(); - ByteBuffer serializedFilter = reader.getMetaBlock(KEY_BLOOM_FILTER_META_BLOCK, false); + fileInfo = reader.getHFileInfo(); + ByteBuff serializedFilter = reader.getMetaBlock(KEY_BLOOM_FILTER_META_BLOCK, false).getBufferWithoutHeader(); byte[] filterBytes = new byte[serializedFilter.remaining()]; serializedFilter.get(filterBytes); // read the bytes that were written return BloomFilterFactory.fromString(new String(filterBytes), @@ -206,7 +215,7 @@ private List> readAllRecords(Schema writerSchema, Schema readerS final HFileScanner scanner = reader.getScanner(false, false); if (scanner.seekTo()) { do { - Cell c = scanner.getKeyValue(); + Cell c = scanner.getCell(); final Pair keyAndRecordPair = getRecordFromCell(c, writerSchema, readerSchema, keyFieldSchema); recordList.add(keyAndRecordPair); } while (scanner.next()); @@ -250,7 +259,7 @@ public List> readRecords(List keys) throws IOException { */ public List> readRecords(List keys, Schema schema) throws IOException { this.schema = schema; - reader.loadFileInfo(); + reader.getHFileInfo(); List> records = new ArrayList<>(); for (String key: keys) { Option value = getRecordByKey(key, schema); @@ -263,7 +272,7 @@ public List> readRecords(List keys, Schema schema) throw public ClosableIterator getRecordIterator(List keys, Schema schema) throws IOException { this.schema = schema; - reader.loadFileInfo(); + reader.getHFileInfo(); Iterator iterator = keys.iterator(); return new ClosableIterator() { private R next; @@ -310,7 +319,7 @@ public boolean hasNext() { // To handle when hasNext() is called multiple times for idempotency and/or the first time if (this.next == null && !this.eof) { if (!scanner.isSeeked() && scanner.seekTo()) { - final Pair keyAndRecordPair = getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema, keyFieldSchema); + final Pair keyAndRecordPair = 
getRecordFromCell(scanner.getCell(), getSchema(), readerSchema, keyFieldSchema); this.next = keyAndRecordPair.getSecond(); } } @@ -331,7 +340,7 @@ public R next() { } R retVal = this.next; if (scanner.next()) { - final Pair keyAndRecordPair = getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema, keyFieldSchema); + final Pair keyAndRecordPair = getRecordFromCell(scanner.getCell(), getSchema(), readerSchema, keyFieldSchema); this.next = keyAndRecordPair.getSecond(); } else { this.next = null; @@ -371,7 +380,7 @@ public Option getRecordByKey(String key, Schema readerSchema) throws IOException } if (keyScanner.seekTo(kv) == 0) { - Cell c = keyScanner.getKeyValue(); + Cell c = keyScanner.getCell(); // Extract the byte value before releasing the lock since we cannot hold on to the returned cell afterwards value = Arrays.copyOfRange(c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength()); } diff --git a/hudi-common/src/main/resources/hbase-site.xml b/hudi-common/src/main/resources/hbase-site.xml new file mode 100644 index 000000000000..ad680e6b8999 --- /dev/null +++ b/hudi-common/src/main/resources/hbase-site.xml @@ -0,0 +1,2185 @@ + + + + + + + + + + + + hbase.tmp.dir + ${java.io.tmpdir}/hbase-${user.name} + Temporary directory on the local filesystem. + Change this setting to point to a location more permanent + than '/tmp', the usual resolve for java.io.tmpdir, as the + '/tmp' directory is cleared on machine restart. + + + + hbase.rootdir + ${hbase.tmp.dir}/hbase + The directory shared by region servers and into + which HBase persists. The URL should be 'fully-qualified' + to include the filesystem scheme. For example, to specify the + HDFS directory '/hbase' where the HDFS instance's namenode is + running at namenode.example.org on port 9000, set this value to: + hdfs://namenode.example.org:9000/hbase. By default, we write + to whatever ${hbase.tmp.dir} is set too -- usually /tmp -- + so change this configuration or else all data will be lost on + machine restart. + + + + hbase.cluster.distributed + false + The mode the cluster will be in. Possible values are + false for standalone mode and true for distributed mode. If + false, startup will run all HBase and ZooKeeper daemons together + in the one JVM. + + + + hbase.zookeeper.quorum + + 127.0.0.1 + Comma separated list of servers in the ZooKeeper ensemble + (This config. should have been named hbase.zookeeper.ensemble). + For example, "host1.mydomain.com,host2.mydomain.com,host3.mydomain.com". + By default this is set to localhost for local and pseudo-distributed modes + of operation. For a fully-distributed setup, this should be set to a full + list of ZooKeeper ensemble servers. If HBASE_MANAGES_ZK is set in hbase-env.sh + this is the list of servers which hbase will start/stop ZooKeeper on as + part of cluster start/stop. Client-side, we will take this list of + ensemble members and put it together with the hbase.zookeeper.property.clientPort + config. and pass it into zookeeper constructor as the connectString + parameter. + + + + + + zookeeper.recovery.retry.maxsleeptime + 60000 + Max sleep time before retry zookeeper operations in milliseconds, + a max time is needed here so that sleep time won't grow unboundedly + + + + hbase.local.dir + ${hbase.tmp.dir}/local/ + Directory on the local filesystem to be used + as a local storage. + + + + + + hbase.master.port + 16000 + The port the HBase Master should bind to. + + + hbase.master.info.port + 16010 + The port for the HBase Master web UI. 
+ Set to -1 if you do not want a UI instance run. + + + + hbase.master.info.bindAddress + 0.0.0.0 + The bind address for the HBase Master web UI + + + + hbase.master.logcleaner.plugins + + org.apache.hadoop.hbase.master.cleaner.TimeToLiveLogCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveProcedureWALCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveMasterLocalStoreWALCleaner + + A comma-separated list of BaseLogCleanerDelegate invoked by + the LogsCleaner service. These WAL cleaners are called in order, + so put the cleaner that prunes the most files in front. To + implement your own BaseLogCleanerDelegate, just put it in HBase's classpath + and add the fully qualified class name here. Always add the above + default log cleaners in the list. + + + + hbase.master.logcleaner.ttl + 600000 + How long a WAL remain in the archive ({hbase.rootdir}/oldWALs) directory, + after which it will be cleaned by a Master thread. The value is in milliseconds. + + + + hbase.master.hfilecleaner.plugins + + org.apache.hadoop.hbase.master.cleaner.TimeToLiveHFileCleaner,org.apache.hadoop.hbase.master.cleaner.TimeToLiveMasterLocalStoreHFileCleaner + + A comma-separated list of BaseHFileCleanerDelegate invoked by + the HFileCleaner service. These HFiles cleaners are called in order, + so put the cleaner that prunes the most files in front. To + implement your own BaseHFileCleanerDelegate, just put it in HBase's classpath + and add the fully qualified class name here. Always add the above + default hfile cleaners in the list as they will be overwritten in + hbase-site.xml. + + + + hbase.master.infoserver.redirect + true + Whether or not the Master listens to the Master web + UI port (hbase.master.info.port) and redirects requests to the web + UI server shared by the Master and RegionServer. Config. makes + sense when Master is serving Regions (not the default). + + + + hbase.master.fileSplitTimeout + 600000 + Splitting a region, how long to wait on the file-splitting + step before aborting the attempt. Default: 600000. This setting used + to be known as hbase.regionserver.fileSplitTimeout in hbase-1.x. + Split is now run master-side hence the rename (If a + 'hbase.master.fileSplitTimeout' setting found, will use it to + prime the current 'hbase.master.fileSplitTimeout' + Configuration. + + + + + + hbase.regionserver.port + 16020 + The port the HBase RegionServer binds to. + + + hbase.regionserver.info.port + 16030 + The port for the HBase RegionServer web UI + Set to -1 if you do not want the RegionServer UI to run. + + + + hbase.regionserver.info.bindAddress + 0.0.0.0 + The address for the HBase RegionServer web UI + + + hbase.regionserver.info.port.auto + false + Whether or not the Master or RegionServer + UI should search for a port to bind to. Enables automatic port + search if hbase.regionserver.info.port is already in use. + Useful for testing, turned off by default. + + + + hbase.regionserver.handler.count + 30 + Count of RPC Listener instances spun up on RegionServers. + Same property is used by the Master for count of master handlers. + Too many handlers can be counter-productive. Make it a multiple of + CPU count. If mostly read-only, handlers count close to cpu count + does well. Start with twice the CPU count and tune from there. + + + + hbase.ipc.server.callqueue.handler.factor + 0.1 + Factor to determine the number of call queues. + A value of 0 means a single queue shared between all the handlers. + A value of 1 means that each handler has its own queue. 
+ + + + hbase.ipc.server.callqueue.read.ratio + 0 + Split the call queues into read and write queues. + The specified interval (which should be between 0.0 and 1.0) + will be multiplied by the number of call queues. + A value of 0 indicate to not split the call queues, meaning that both read and write + requests will be pushed to the same set of queues. + A value lower than 0.5 means that there will be less read queues than write queues. + A value of 0.5 means there will be the same number of read and write queues. + A value greater than 0.5 means that there will be more read queues than write queues. + A value of 1.0 means that all the queues except one are used to dispatch read requests. + + Example: Given the total number of call queues being 10 + a read.ratio of 0 means that: the 10 queues will contain both read/write requests. + a read.ratio of 0.3 means that: 3 queues will contain only read requests + and 7 queues will contain only write requests. + a read.ratio of 0.5 means that: 5 queues will contain only read requests + and 5 queues will contain only write requests. + a read.ratio of 0.8 means that: 8 queues will contain only read requests + and 2 queues will contain only write requests. + a read.ratio of 1 means that: 9 queues will contain only read requests + and 1 queues will contain only write requests. + + + + hbase.ipc.server.callqueue.scan.ratio + 0 + Given the number of read call queues, calculated from the total number + of call queues multiplied by the callqueue.read.ratio, the scan.ratio property + will split the read call queues into small-read and long-read queues. + A value lower than 0.5 means that there will be less long-read queues than short-read queues. + A value of 0.5 means that there will be the same number of short-read and long-read queues. + A value greater than 0.5 means that there will be more long-read queues than short-read queues + A value of 0 or 1 indicate to use the same set of queues for gets and scans. + + Example: Given the total number of read call queues being 8 + a scan.ratio of 0 or 1 means that: 8 queues will contain both long and short read requests. + a scan.ratio of 0.3 means that: 2 queues will contain only long-read requests + and 6 queues will contain only short-read requests. + a scan.ratio of 0.5 means that: 4 queues will contain only long-read requests + and 4 queues will contain only short-read requests. + a scan.ratio of 0.8 means that: 6 queues will contain only long-read requests + and 2 queues will contain only short-read requests. + + + + hbase.regionserver.msginterval + 3000 + Interval between messages from the RegionServer to Master + in milliseconds. + + + + hbase.regionserver.logroll.period + 3600000 + Period at which we will roll the commit log regardless + of how many edits it has. + + + + hbase.regionserver.logroll.errors.tolerated + 2 + The number of consecutive WAL close errors we will allow + before triggering a server abort. A setting of 0 will cause the + region server to abort if closing the current WAL writer fails during + log rolling. Even a small value (2 or 3) will allow a region server + to ride over transient HDFS errors. + + + + hbase.regionserver.hlog.reader.impl + org.apache.hadoop.hbase.regionserver.wal.ProtobufLogReader + The WAL file reader implementation. + + + hbase.regionserver.hlog.writer.impl + org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter + The WAL file writer implementation. 
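The hbase-site.xml being added here ships as a classpath resource inside hudi-common. As a minimal sketch of how such a resource is consumed (an illustration, assuming the resource is on the classpath): a plain Hadoop Configuration does not load it by itself, so it is either added by name or picked up via HBaseConfiguration.create(), which looks up hbase-site.xml on the classpath automatically.

import org.apache.hadoop.conf.Configuration;

public class HBaseSiteLookup {
  public static void main(String[] args) {
    // Illustrative only: a vanilla Configuration loads core-default.xml /
    // core-site.xml, not hbase-site.xml, so the bundled resource is added by name.
    Configuration conf = new Configuration();
    conf.addResource("hbase-site.xml");

    // Values as declared in this resource file.
    System.out.println(conf.get("hbase.regionserver.handler.count")); // 30
    System.out.println(conf.get("zookeeper.session.timeout"));        // 90000
    System.out.println(conf.get("hbase.client.retries.number"));      // 15
  }
}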
+ + + hbase.regionserver.global.memstore.size + + Maximum size of all memstores in a region server before new + updates are blocked and flushes are forced. Defaults to 40% of heap (0.4). + Updates are blocked and flushes are forced until size of all memstores + in a region server hits hbase.regionserver.global.memstore.size.lower.limit. + The default value in this configuration has been intentionally left empty in order to + honor the old hbase.regionserver.global.memstore.upperLimit property if present. + + + + hbase.regionserver.global.memstore.size.lower.limit + + Maximum size of all memstores in a region server before flushes + are forced. Defaults to 95% of hbase.regionserver.global.memstore.size + (0.95). A 100% value for this value causes the minimum possible flushing + to occur when updates are blocked due to memstore limiting. The default + value in this configuration has been intentionally left empty in order to + honor the old hbase.regionserver.global.memstore.lowerLimit property if + present. + + + + hbase.systemtables.compacting.memstore.type + NONE + Determines the type of memstore to be used for system tables like + META, namespace tables etc. By default NONE is the type and hence we use the + default memstore for all the system tables. If we need to use compacting + memstore for system tables then set this property to BASIC/EAGER + + + + hbase.regionserver.optionalcacheflushinterval + 3600000 + + Maximum amount of time an edit lives in memory before being automatically flushed. + Default 1 hour. Set it to 0 to disable automatic flushing. + + + + hbase.regionserver.dns.interface + default + The name of the Network Interface from which a region server + should report its IP address. + + + + hbase.regionserver.dns.nameserver + default + The host name or IP address of the name server (DNS) + which a region server should use to determine the host name used by the + master for communication and display purposes. + + + + hbase.regionserver.region.split.policy + org.apache.hadoop.hbase.regionserver.SteppingSplitPolicy + + A split policy determines when a region should be split. The various + other split policies that are available currently are BusyRegionSplitPolicy, + ConstantSizeRegionSplitPolicy, DisabledRegionSplitPolicy, + DelimitedKeyPrefixRegionSplitPolicy, KeyPrefixRegionSplitPolicy, and + SteppingSplitPolicy. DisabledRegionSplitPolicy blocks manual region splitting. + + + + hbase.regionserver.regionSplitLimit + 1000 + + Limit for the number of regions after which no more region splitting + should take place. This is not hard limit for the number of regions + but acts as a guideline for the regionserver to stop splitting after + a certain limit. Default is set to 1000. + + + + + + zookeeper.session.timeout + 90000 + ZooKeeper session timeout in milliseconds. It is used in two different ways. + First, this value is used in the ZK client that HBase uses to connect to the ensemble. + It is also used by HBase when it starts a ZK server and it is passed as the 'maxSessionTimeout'. + See https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#ch_zkSessions. + For example, if an HBase region server connects to a ZK ensemble that's also managed + by HBase, then the session timeout will be the one specified by this configuration. + But, a region server that connects to an ensemble managed with a different configuration + will be subjected that ensemble's maxSessionTimeout. 
So, even though HBase might propose + using 90 seconds, the ensemble can have a max timeout lower than this and it will take + precedence. The current default maxSessionTimeout that ZK ships with is 40 seconds, which is lower than + HBase's. + + + + zookeeper.znode.parent + /hbase + Root ZNode for HBase in ZooKeeper. All of HBase's ZooKeeper + files that are configured with a relative path will go under this node. + By default, all of HBase's ZooKeeper file paths are configured with a + relative path, so they will all go under this directory unless changed. + + + + zookeeper.znode.acl.parent + acl + Root ZNode for access control lists. + + + hbase.zookeeper.dns.interface + default + The name of the Network Interface from which a ZooKeeper server + should report its IP address. + + + + hbase.zookeeper.dns.nameserver + default + The host name or IP address of the name server (DNS) + which a ZooKeeper server should use to determine the host name used by the + master for communication and display purposes. + + + + + hbase.zookeeper.peerport + 2888 + Port used by ZooKeeper peers to talk to each other. + See https://zookeeper.apache.org/doc/r3.3.3/zookeeperStarted.html#sc_RunningReplicatedZooKeeper + for more information. + + + + hbase.zookeeper.leaderport + 3888 + Port used by ZooKeeper for leader election. + See https://zookeeper.apache.org/doc/r3.3.3/zookeeperStarted.html#sc_RunningReplicatedZooKeeper + for more information. + + + + + + + hbase.zookeeper.property.initLimit + 10 + Property from ZooKeeper's config zoo.cfg. + The number of ticks that the initial synchronization phase can take. + + + + hbase.zookeeper.property.syncLimit + 5 + Property from ZooKeeper's config zoo.cfg. + The number of ticks that can pass between sending a request and getting an + acknowledgment. + + + + hbase.zookeeper.property.dataDir + ${hbase.tmp.dir}/zookeeper + Property from ZooKeeper's config zoo.cfg. + The directory where the snapshot is stored. + + + + hbase.zookeeper.property.clientPort + 2181 + Property from ZooKeeper's config zoo.cfg. + The port at which the clients will connect. + + + + hbase.zookeeper.property.maxClientCnxns + 300 + Property from ZooKeeper's config zoo.cfg. + Limit on number of concurrent connections (at the socket level) that a + single client, identified by IP address, may make to a single member of + the ZooKeeper ensemble. Set high to avoid zk connection issues running + standalone and pseudo-distributed. + + + + + + + hbase.client.write.buffer + 2097152 + Default size of the BufferedMutator write buffer in bytes. + A bigger buffer takes more memory -- on both the client and server + side since server instantiates the passed write buffer to process + it -- but a larger buffer size reduces the number of RPCs made. + For an estimate of server-side memory-used, evaluate + hbase.client.write.buffer * hbase.regionserver.handler.count + + + + hbase.client.pause + 100 + General client pause value. Used mostly as value to wait + before running a retry of a failed get, region lookup, etc. + See hbase.client.retries.number for description of how we backoff from + this initial pause amount and how this pause works w/ retries. + + + + hbase.client.pause.cqtbe + + Whether or not to use a special client pause for + CallQueueTooBigException (cqtbe). Set this property to a higher value + than hbase.client.pause if you observe frequent CQTBE from the same + RegionServer and the call queue there keeps full + + + + hbase.client.retries.number + 15 + Maximum retries. 
Used as maximum for all retryable + operations such as the getting of a cell's value, starting a row update, + etc. Retry interval is a rough function based on hbase.client.pause. At + first we retry at this interval but then with backoff, we pretty quickly reach + retrying every ten seconds. See HConstants#RETRY_BACKOFF for how the backup + ramps up. Change this setting and hbase.client.pause to suit your workload. + + + + hbase.client.max.total.tasks + 100 + The maximum number of concurrent mutation tasks a single HTable instance will + send to the cluster. + + + + hbase.client.max.perserver.tasks + 2 + The maximum number of concurrent mutation tasks a single HTable instance will + send to a single region server. + + + + hbase.client.max.perregion.tasks + 1 + The maximum number of concurrent mutation tasks the client will + maintain to a single Region. That is, if there is already + hbase.client.max.perregion.tasks writes in progress for this region, new puts + won't be sent to this region until some writes finishes. + + + + hbase.client.perserver.requests.threshold + 2147483647 + The max number of concurrent pending requests for one server in all client threads + (process level). Exceeding requests will be thrown ServerTooBusyException immediately to prevent + user's threads being occupied and blocked by only one slow region server. If you use a fix + number of threads to access HBase in a synchronous way, set this to a suitable value which is + related to the number of threads will help you. See + https://issues.apache.org/jira/browse/HBASE-16388 for details. + + + + hbase.client.scanner.caching + 2147483647 + Number of rows that we try to fetch when calling next + on a scanner if it is not served from (local, client) memory. This configuration + works together with hbase.client.scanner.max.result.size to try and use the + network efficiently. The default value is Integer.MAX_VALUE by default so that + the network will fill the chunk size defined by hbase.client.scanner.max.result.size + rather than be limited by a particular number of rows since the size of rows varies + table to table. If you know ahead of time that you will not require more than a certain + number of rows from a scan, this configuration should be set to that row limit via + Scan#setCaching. Higher caching values will enable faster scanners but will eat up more + memory and some calls of next may take longer and longer times when the cache is empty. + Do not set this value such that the time between invocations is greater than the scanner + timeout; i.e. hbase.client.scanner.timeout.period + + + + hbase.client.keyvalue.maxsize + 10485760 + Specifies the combined maximum allowed size of a KeyValue + instance. This is to set an upper boundary for a single entry saved in a + storage file. Since they cannot be split it helps avoiding that a region + cannot be split any further because the data is too large. It seems wise + to set this to a fraction of the maximum region size. Setting it to zero + or less disables the check. + + + + hbase.server.keyvalue.maxsize + 10485760 + Maximum allowed size of an individual cell, inclusive of value and all key + components. A value of 0 or less disables the check. + The default value is 10MB. + This is a safety setting to protect the server from OOM situations. + + + + hbase.client.scanner.timeout.period + 60000 + Client scanner lease period in milliseconds. + + + hbase.client.localityCheck.threadPoolSize + 2 + + + + + hbase.bulkload.retries.number + 10 + Maximum retries. 
This is maximum number of iterations + to atomic bulk loads are attempted in the face of splitting operations + 0 means never give up. + + + + hbase.master.balancer.maxRitPercent + 1.0 + The max percent of regions in transition when balancing. + The default value is 1.0. So there are no balancer throttling. If set this config to 0.01, + It means that there are at most 1% regions in transition when balancing. + Then the cluster's availability is at least 99% when balancing. + + + + hbase.balancer.period + + 300000 + Period at which the region balancer runs in the Master, in + milliseconds. + + + + hbase.regions.slop + 0.001 + Rebalance if any regionserver has average + (average * slop) regions. + The default value of this parameter is 0.001 in StochasticLoadBalancer (the default load + balancer), while the default is 0.2 in other load balancers (i.e., + SimpleLoadBalancer). + + + + hbase.normalizer.period + 300000 + Period at which the region normalizer runs in the Master, in + milliseconds. + + + + hbase.normalizer.split.enabled + true + Whether to split a region as part of normalization. + + + hbase.normalizer.merge.enabled + true + Whether to merge a region as part of normalization. + + + hbase.normalizer.min.region.count + 3 + The minimum number of regions in a table to consider it for merge + normalization. + + + + hbase.normalizer.merge.min_region_age.days + 3 + The minimum age for a region to be considered for a merge, in days. + + + hbase.normalizer.merge.min_region_age.days + 3 + The minimum age for a region to be considered for a merge, in days. + + + hbase.normalizer.merge.min_region_size.mb + 1 + The minimum size for a region to be considered for a merge, in whole + MBs. + + + + hbase.table.normalization.enabled + false + This config is used to set default behaviour of normalizer at table level. + To override this at table level one can set NORMALIZATION_ENABLED at table descriptor level + and that property will be honored + + + + hbase.server.thread.wakefrequency + 10000 + Time to sleep in between searches for work (in milliseconds). + Used as sleep interval by service threads such as log roller. + + + + hbase.server.versionfile.writeattempts + 3 + + How many times to retry attempting to write a version file + before just aborting. Each attempt is separated by the + hbase.server.thread.wakefrequency milliseconds. + + + + hbase.hregion.memstore.flush.size + 134217728 + + Memstore will be flushed to disk if size of the memstore + exceeds this number of bytes. Value is checked by a thread that runs + every hbase.server.thread.wakefrequency. + + + + hbase.hregion.percolumnfamilyflush.size.lower.bound.min + 16777216 + + If FlushLargeStoresPolicy is used and there are multiple column families, + then every time that we hit the total memstore limit, we find out all the + column families whose memstores exceed a "lower bound" and only flush them + while retaining the others in memory. The "lower bound" will be + "hbase.hregion.memstore.flush.size / column_family_number" by default + unless value of this property is larger than that. If none of the families + have their memstore size more than lower bound, all the memstores will be + flushed (just as usual). + + + + hbase.hregion.preclose.flush.size + 5242880 + + If the memstores in a region are this size or larger when we go + to close, run a "pre-flush" to clear out memstores before we put up + the region closed flag and take the region offline. On close, + a flush is run under the close flag to empty memory. 
During + this time the region is offline and we are not taking on any writes. + If the memstore content is large, this flush could take a long time to + complete. The preflush is meant to clean out the bulk of the memstore + before putting up the close flag and taking the region offline so the + flush that runs under the close flag has little to do. + + + + hbase.hregion.memstore.block.multiplier + 4 + + Block updates if memstore has hbase.hregion.memstore.block.multiplier + times hbase.hregion.memstore.flush.size bytes. Useful preventing + runaway memstore during spikes in update traffic. Without an + upper-bound, memstore fills such that when it flushes the + resultant flush files take a long time to compact or split, or + worse, we OOME. + + + + hbase.hregion.memstore.mslab.enabled + true + + Enables the MemStore-Local Allocation Buffer, + a feature which works to prevent heap fragmentation under + heavy write loads. This can reduce the frequency of stop-the-world + GC pauses on large heaps. + + + + hbase.hregion.memstore.mslab.chunksize + 2097152 + The maximum byte size of a chunk in the MemStoreLAB. Unit: bytes + + + hbase.regionserver.offheap.global.memstore.size + 0 + The amount of off-heap memory all MemStores in a RegionServer may use. + A value of 0 means that no off-heap memory will be used and all chunks in MSLAB + will be HeapByteBuffer, otherwise the non-zero value means how many megabyte of + off-heap memory will be used for chunks in MSLAB and all chunks in MSLAB will be + DirectByteBuffer. Unit: megabytes. + + + + hbase.hregion.memstore.mslab.max.allocation + 262144 + The maximal size of one allocation in the MemStoreLAB, if the desired byte + size exceed this threshold then it will be just allocated from JVM heap rather than MemStoreLAB. + + + + hbase.hregion.max.filesize + 10737418240 + + Maximum HFile size. If the sum of the sizes of a region's HFiles has grown to exceed this + value, the region is split in two. + + + + hbase.hregion.split.overallfiles + false + If we should sum overall region files size when check to split. + + + hbase.hregion.majorcompaction + 604800000 + Time between major compactions, expressed in milliseconds. Set to 0 to disable + time-based automatic major compactions. User-requested and size-based major compactions will + still run. This value is multiplied by hbase.hregion.majorcompaction.jitter to cause + compaction to start at a somewhat-random time during a given window of time. The default value + is 7 days, expressed in milliseconds. If major compactions are causing disruption in your + environment, you can configure them to run at off-peak times for your deployment, or disable + time-based major compactions by setting this parameter to 0, and run major compactions in a + cron job or by another external mechanism. + + + + hbase.hregion.majorcompaction.jitter + 0.50 + A multiplier applied to hbase.hregion.majorcompaction to cause compaction to occur + a given amount of time either side of hbase.hregion.majorcompaction. The smaller the number, + the closer the compactions will happen to the hbase.hregion.majorcompaction + interval. + + + + hbase.hstore.compactionThreshold + 3 + If more than this number of StoreFiles exist in any one Store + (one StoreFile is written per flush of MemStore), a compaction is run to rewrite all + StoreFiles into a single StoreFile. Larger values delay compaction, but when compaction does + occur, it takes longer to complete. 
+ + + + hbase.regionserver.compaction.enabled + true + Enable/disable compactions on by setting true/false. + We can further switch compactions dynamically with the + compaction_switch shell command. + + + + hbase.hstore.flusher.count + 2 + The number of flush threads. With fewer threads, the MemStore flushes will be + queued. With more threads, the flushes will be executed in parallel, increasing the load on + HDFS, and potentially causing more compactions. + + + + hbase.hstore.blockingStoreFiles + 16 + If more than this number of StoreFiles exist in any one Store (one StoreFile + is written per flush of MemStore), updates are blocked for this region until a compaction is + completed, or until hbase.hstore.blockingWaitTime has been exceeded. + + + + hbase.hstore.blockingWaitTime + 90000 + The time for which a region will block updates after reaching the StoreFile limit + defined by hbase.hstore.blockingStoreFiles. After this time has elapsed, the region will stop + blocking updates even if a compaction has not been completed. + + + + hbase.hstore.compaction.min + + The minimum number of StoreFiles which must be eligible for compaction before + compaction can run. The goal of tuning hbase.hstore.compaction.min is to avoid ending up with + too many tiny StoreFiles to compact. Setting this value to 2 would cause a minor compaction + each time you have two StoreFiles in a Store, and this is probably not appropriate. If you + set this value too high, all the other values will need to be adjusted accordingly. For most + cases, the default value is appropriate (empty value here, results in 3 by code logic). In + previous versions of HBase, the parameter hbase.hstore.compaction.min was named + hbase.hstore.compactionThreshold. + + + + hbase.hstore.compaction.max + 10 + The maximum number of StoreFiles which will be selected for a single minor + compaction, regardless of the number of eligible StoreFiles. Effectively, the value of + hbase.hstore.compaction.max controls the length of time it takes a single compaction to + complete. Setting it larger means that more StoreFiles are included in a compaction. For most + cases, the default value is appropriate. + + + + hbase.hstore.compaction.min.size + 134217728 + A StoreFile (or a selection of StoreFiles, when using ExploringCompactionPolicy) + smaller than this size will always be eligible for minor compaction. + HFiles this size or larger are evaluated by hbase.hstore.compaction.ratio to determine if + they are eligible. Because this limit represents the "automatic include" limit for all + StoreFiles smaller than this value, this value may need to be reduced in write-heavy + environments where many StoreFiles in the 1-2 MB range are being flushed, because every + StoreFile will be targeted for compaction and the resulting StoreFiles may still be under the + minimum size and require further compaction. If this parameter is lowered, the ratio check is + triggered more quickly. This addressed some issues seen in earlier versions of HBase but + changing this parameter is no longer necessary in most situations. Default: 128 MB expressed + in bytes. + + + + hbase.hstore.compaction.max.size + 9223372036854775807 + A StoreFile (or a selection of StoreFiles, when using ExploringCompactionPolicy) + larger than this size will be excluded from compaction. The effect of + raising hbase.hstore.compaction.max.size is fewer, larger StoreFiles that do not get + compacted often. 
If you feel that compaction is happening too often without much benefit, you + can try raising this value. Default: the value of LONG.MAX_VALUE, expressed in bytes. + + + + hbase.hstore.compaction.ratio + 1.2F + For minor compaction, this ratio is used to determine whether a given StoreFile + which is larger than hbase.hstore.compaction.min.size is eligible for compaction. Its + effect is to limit compaction of large StoreFiles. The value of hbase.hstore.compaction.ratio + is expressed as a floating-point decimal. A large ratio, such as 10, will produce a single + giant StoreFile. Conversely, a low value, such as .25, will produce behavior similar to the + BigTable compaction algorithm, producing four StoreFiles. A moderate value of between 1.0 and + 1.4 is recommended. When tuning this value, you are balancing write costs with read costs. + Raising the value (to something like 1.4) will have more write costs, because you will + compact larger StoreFiles. However, during reads, HBase will need to seek through fewer + StoreFiles to accomplish the read. Consider this approach if you cannot take advantage of + Bloom filters. Otherwise, you can lower this value to something like 1.0 to reduce the + background cost of writes, and use Bloom filters to control the number of StoreFiles touched + during reads. For most cases, the default value is appropriate. + + + + hbase.hstore.compaction.ratio.offpeak + 5.0F + Allows you to set a different (by default, more aggressive) ratio for determining + whether larger StoreFiles are included in compactions during off-peak hours. Works in the + same way as hbase.hstore.compaction.ratio. Only applies if hbase.offpeak.start.hour and + hbase.offpeak.end.hour are also enabled. + + + + hbase.hstore.time.to.purge.deletes + 0 + The amount of time to delay purging of delete markers with future timestamps. If + unset, or set to 0, all delete markers, including those with future timestamps, are purged + during the next major compaction. Otherwise, a delete marker is kept until the major compaction + which occurs after the marker's timestamp plus the value of this setting, in milliseconds. + + + + hbase.offpeak.start.hour + -1 + The start of off-peak hours, expressed as an integer between 0 and 23, inclusive. + Set to -1 to disable off-peak. + + + + hbase.offpeak.end.hour + -1 + The end of off-peak hours, expressed as an integer between 0 and 23, inclusive. Set + to -1 to disable off-peak. + + + + hbase.regionserver.thread.compaction.throttle + 2684354560 + There are two different thread pools for compactions, one for large compactions and + the other for small compactions. This helps to keep compaction of lean tables (such as + hbase:meta) fast. If a compaction is larger than this threshold, it + goes into the large compaction pool. In most cases, the default value is appropriate. Default: + 2 x hbase.hstore.compaction.max x hbase.hregion.memstore.flush.size (which defaults to 128MB). + The value field assumes that the value of hbase.hregion.memstore.flush.size is unchanged from + the default. + + + + hbase.regionserver.majorcompaction.pagecache.drop + true + Specifies whether to drop pages read/written into the system page cache by + major compactions. Setting it to true helps prevent major compactions from + polluting the page cache, which is almost always required, especially for clusters + with low/moderate memory to storage ratio. 
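The hbase.hregion.majorcompaction description above suggests setting the period to 0 and triggering major compactions from a cron job or another external mechanism instead. A minimal sketch of such an external trigger; the table name stock_ticks_cow is only an illustrative placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class ExternalMajorCompaction {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    try (Connection conn = ConnectionFactory.createConnection(conf);
         Admin admin = conn.getAdmin()) {
      // Request a major compaction explicitly, e.g. from a nightly cron job,
      // after hbase.hregion.majorcompaction has been set to 0 on the cluster.
      admin.majorCompact(TableName.valueOf("stock_ticks_cow"));
    }
  }
}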
+ + + + hbase.regionserver.minorcompaction.pagecache.drop + true + Specifies whether to drop pages read/written into the system page cache by + minor compactions. Setting it to true helps prevent minor compactions from + polluting the page cache, which is most beneficial on clusters with low + memory to storage ratio or very write heavy clusters. You may want to set it to + false under moderate to low write workload when bulk of the reads are + on the most recently written data. + + + + hbase.hstore.compaction.kv.max + 10 + The maximum number of KeyValues to read and then write in a batch when flushing or + compacting. Set this lower if you have big KeyValues and problems with Out Of Memory + Exceptions Set this higher if you have wide, small rows. + + + + hbase.storescanner.parallel.seek.enable + false + + Enables StoreFileScanner parallel-seeking in StoreScanner, + a feature which can reduce response latency under special conditions. + + + + hbase.storescanner.parallel.seek.threads + 10 + + The default thread pool size if parallel-seeking feature enabled. + + + + hfile.block.cache.policy + LRU + The eviction policy for the L1 block cache (LRU or TinyLFU). + + + hfile.block.cache.size + 0.4 + Percentage of maximum heap (-Xmx setting) to allocate to block cache + used by a StoreFile. Default of 0.4 means allocate 40%. + Set to 0 to disable but it's not recommended; you need at least + enough cache to hold the storefile indices. + + + + hfile.block.index.cacheonwrite + false + This allows to put non-root multi-level index blocks into the block + cache at the time the index is being written. + + + + hfile.index.block.max.size + 131072 + When the size of a leaf-level, intermediate-level, or root-level + index block in a multi-level block index grows to this size, the + block is written out and a new block is started. + + + + hbase.bucketcache.ioengine + + Where to store the contents of the bucketcache. One of: offheap, + file, files, mmap or pmem. If a file or files, set it to file(s):PATH_TO_FILE. + mmap means the content will be in an mmaped file. Use mmap:PATH_TO_FILE. 'pmem' + is bucket cache over a file on the persistent memory device. + Use pmem:PATH_TO_FILE. + See http://hbase.apache.org/book.html#offheap.blockcache for more information. + + + + hbase.hstore.compaction.throughput.lower.bound + 52428800 + The target lower bound on aggregate compaction throughput, in bytes/sec. Allows + you to tune the minimum available compaction throughput when the + PressureAwareCompactionThroughputController throughput controller is active. (It is active by + default.) + + + + hbase.hstore.compaction.throughput.higher.bound + 104857600 + The target upper bound on aggregate compaction throughput, in bytes/sec. Allows + you to control aggregate compaction throughput demand when the + PressureAwareCompactionThroughputController throughput controller is active. (It is active by + default.) The maximum throughput will be tuned between the lower and upper bounds when + compaction pressure is within the range [0.0, 1.0]. If compaction pressure is 1.0 or greater + the higher bound will be ignored until pressure returns to the normal range. + + + + hbase.bucketcache.size + + A float that EITHER represents a percentage of total heap memory + size to give to the cache (if < 1.0) OR, it is the total capacity in + megabytes of BucketCache. Default: 0.0 + + + + hbase.bucketcache.bucket.sizes + + A comma-separated list of sizes for buckets for the bucketcache. + Can be multiple sizes. 
List block sizes in order from smallest to largest. + The sizes you use will depend on your data access patterns. + Must be a multiple of 256 else you will run into + 'java.io.IOException: Invalid HFile block magic' when you go to read from cache. + If you specify no values here, then you pick up the default bucketsizes set + in code (See BucketAllocator#DEFAULT_BUCKET_SIZES). + + + + hfile.format.version + 3 + The HFile format version to use for new files. + Version 3 adds support for tags in hfiles (See http://hbase.apache.org/book.html#hbase.tags). + Also see the configuration 'hbase.replication.rpc.codec'. + + + + hfile.block.bloom.cacheonwrite + false + Enables cache-on-write for inline blocks of a compound Bloom filter. + + + io.storefile.bloom.block.size + 131072 + The size in bytes of a single block ("chunk") of a compound Bloom + filter. This size is approximate, because Bloom blocks can only be + inserted at data block boundaries, and the number of keys per data + block varies. + + + + hbase.rs.cacheblocksonwrite + false + Whether an HFile block should be added to the block cache when the + block is finished. + + + + hbase.rpc.timeout + 60000 + This is for the RPC layer to define how long (millisecond) HBase client applications + take for a remote call to time out. It uses pings to check connections + but will eventually throw a TimeoutException. + + + + hbase.client.operation.timeout + 1200000 + Operation timeout is a top-level restriction (millisecond) that makes sure a + blocking operation in Table will not be blocked more than this. In each operation, if rpc + request fails because of timeout or other reason, it will retry until success or throw + RetriesExhaustedException. But if the total time being blocking reach the operation timeout + before retries exhausted, it will break early and throw SocketTimeoutException. + + + + hbase.cells.scanned.per.heartbeat.check + 10000 + The number of cells scanned in between heartbeat checks. Heartbeat + checks occur during the processing of scans to determine whether or not the + server should stop scanning in order to send back a heartbeat message to the + client. Heartbeat messages are used to keep the client-server connection alive + during long running scans. Small values mean that the heartbeat checks will + occur more often and thus will provide a tighter bound on the execution time of + the scan. Larger values mean that the heartbeat checks occur less frequently + + + + hbase.rpc.shortoperation.timeout + 10000 + This is another version of "hbase.rpc.timeout". For those RPC operation + within cluster, we rely on this configuration to set a short timeout limitation + for short operation. For example, short rpc timeout for region server's trying + to report to active master can benefit quicker master failover process. + + + + hbase.ipc.client.tcpnodelay + true + Set no delay on rpc socket connections. See + http://docs.oracle.com/javase/1.5.0/docs/api/java/net/Socket.html#getTcpNoDelay() + + + + hbase.unsafe.regionserver.hostname + + This config is for experts: don't set its value unless you really know what you are doing. + When set to a non-empty value, this represents the (external facing) hostname for the underlying server. + See https://issues.apache.org/jira/browse/HBASE-12954 for details. + + + + hbase.unsafe.regionserver.hostname.disable.master.reversedns + false + This config is for experts: don't set its value unless you really know what you are doing. 
+ When set to true, regionserver will use the current node hostname for the servername and HMaster will + skip reverse DNS lookup and use the hostname sent by regionserver instead. Note that this config and + hbase.unsafe.regionserver.hostname are mutually exclusive. See https://issues.apache.org/jira/browse/HBASE-18226 + for more details. + + + + + hbase.master.keytab.file + + Full path to the kerberos keytab file to use for logging in + the configured HMaster server principal. + + + + hbase.master.kerberos.principal + + Ex. "hbase/_HOST@EXAMPLE.COM". The kerberos principal name + that should be used to run the HMaster process. The principal name should + be in the form: user/hostname@DOMAIN. If "_HOST" is used as the hostname + portion, it will be replaced with the actual hostname of the running + instance. + + + + hbase.regionserver.keytab.file + + Full path to the kerberos keytab file to use for logging in + the configured HRegionServer server principal. + + + + hbase.regionserver.kerberos.principal + + Ex. "hbase/_HOST@EXAMPLE.COM". The kerberos principal name + that should be used to run the HRegionServer process. The principal name + should be in the form: user/hostname@DOMAIN. If "_HOST" is used as the + hostname portion, it will be replaced with the actual hostname of the + running instance. An entry for this principal must exist in the file + specified in hbase.regionserver.keytab.file + + + + + hadoop.policy.file + hbase-policy.xml + The policy configuration file used by RPC servers to make + authorization decisions on client requests. Only used when HBase + security is enabled. + + + + hbase.superuser + + List of users or groups (comma-separated), who are allowed + full privileges, regardless of stored ACLs, across the cluster. + Only used when HBase security is enabled. + + + + hbase.auth.key.update.interval + 86400000 + The update interval for master key for authentication tokens + in servers in milliseconds. Only used when HBase security is enabled. + + + + hbase.auth.token.max.lifetime + 604800000 + The maximum lifetime in milliseconds after which an + authentication token expires. Only used when HBase security is enabled. + + + + hbase.ipc.client.fallback-to-simple-auth-allowed + false + When a client is configured to attempt a secure connection, but attempts to + connect to an insecure server, that server may instruct the client to + switch to SASL SIMPLE (unsecure) authentication. This setting controls + whether or not the client will accept this instruction from the server. + When false (the default), the client will not allow the fallback to SIMPLE + authentication, and will abort the connection. + + + + hbase.ipc.server.fallback-to-simple-auth-allowed + false + When a server is configured to require secure connections, it will + reject connection attempts from clients using SASL SIMPLE (unsecure) authentication. + This setting allows secure servers to accept SASL SIMPLE connections from clients + when the client requests. When false (the default), the server will not allow the fallback + to SIMPLE authentication, and will reject the connection. WARNING: This setting should ONLY + be used as a temporary measure while converting clients over to secure authentication. It + MUST BE DISABLED for secure operation. + + + + hbase.display.keys + true + When this is set to true the webUI and such will display all start/end keys + as part of the table details, region names, etc. When this is set to false, + the keys are hidden. 
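The Kerberos properties above (hbase.master.kerberos.principal, hbase.regionserver.keytab.file, and related keys) are server-side; a client usually logs in from its own keytab before opening a connection. A minimal sketch, where the principal and keytab path are placeholder values, not part of this patch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.security.UserGroupInformation;

public class SecureClientLogin {
  public static void login() throws Exception {
    Configuration conf = HBaseConfiguration.create();
    // Both Hadoop and HBase must be switched from 'simple' to 'kerberos'.
    conf.set("hadoop.security.authentication", "kerberos");
    conf.set("hbase.security.authentication", "kerberos");
    UserGroupInformation.setConfiguration(conf);
    // Placeholder principal and keytab; replace with values from your KDC.
    UserGroupInformation.loginUserFromKeytab(
        "hbase-client/host.example.com@EXAMPLE.COM",
        "/etc/security/keytabs/hbase-client.keytab");
  }
}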
+ + + + hbase.coprocessor.enabled + true + Enables or disables coprocessor loading. If 'false' + (disabled), any other coprocessor related configuration will be ignored. + + + + hbase.coprocessor.user.enabled + true + Enables or disables user (aka. table) coprocessor loading. + If 'false' (disabled), any table coprocessor attributes in table + descriptors will be ignored. If "hbase.coprocessor.enabled" is 'false' + this setting has no effect. + + + + hbase.coprocessor.region.classes + + A comma-separated list of Coprocessors that are loaded by + default on all tables. For any override coprocessor method, these classes + will be called in order. After implementing your own Coprocessor, just put + it in HBase's classpath and add the fully qualified class name here. + A coprocessor can also be loaded on demand by setting HTableDescriptor. + + + + hbase.coprocessor.master.classes + + A comma-separated list of + org.apache.hadoop.hbase.coprocessor.MasterObserver coprocessors that are + loaded by default on the active HMaster process. For any implemented + coprocessor methods, the listed classes will be called in order. After + implementing your own MasterObserver, just put it in HBase's classpath + and add the fully qualified class name here. + + + + hbase.coprocessor.abortonerror + true + Set to true to cause the hosting server (master or regionserver) + to abort if a coprocessor fails to load, fails to initialize, or throws an + unexpected Throwable object. Setting this to false will allow the server to + continue execution but the system wide state of the coprocessor in question + will become inconsistent as it will be properly executing in only a subset + of servers, so this is most useful for debugging only. + + + + hbase.rest.port + 8080 + The port for the HBase REST server. + + + hbase.rest.readonly + false + Defines the mode the REST server will be started in. Possible values are: + false: All HTTP methods are permitted - GET/PUT/POST/DELETE. + true: Only the GET method is permitted. + + + + hbase.rest.threads.max + 100 + The maximum number of threads of the REST server thread pool. + Threads in the pool are reused to process REST requests. This + controls the maximum number of requests processed concurrently. + It may help to control the memory used by the REST server to + avoid OOM issues. If the thread pool is full, incoming requests + will be queued up and wait for some free threads. + + + + hbase.rest.threads.min + 2 + The minimum number of threads of the REST server thread pool. + The thread pool always has at least these number of threads so + the REST server is ready to serve incoming requests. + + + + hbase.rest.support.proxyuser + false + Enables running the REST server to support proxy-user mode. + + + hbase.defaults.for.version + 2.4.9 + This defaults file was compiled for version ${project.version}. This variable is used + to make sure that a user doesn't have an old version of hbase-default.xml on the + classpath. + + + + hbase.defaults.for.version.skip + false + Set to true to skip the 'hbase.defaults.for.version' check. + Setting this to true can be useful in contexts other than + the other side of a maven generation; i.e. running in an + IDE. 
You'll want to set this boolean to true to avoid + seeing the RuntimeException complaint: "hbase-default.xml file + seems to be for and old version of HBase (\${hbase.version}), this + version is X.X.X-SNAPSHOT" + + + + hbase.table.lock.enable + true + Set to true to enable locking the table in zookeeper for schema change operations. + Table locking from master prevents concurrent schema modifications to corrupt table + state. + + + + hbase.table.max.rowsize + 1073741824 + + Maximum size of single row in bytes (default is 1 Gb) for Get'ting + or Scan'ning without in-row scan flag set. If row size exceeds this limit + RowTooBigException is thrown to client. + + + + hbase.thrift.minWorkerThreads + 16 + The "core size" of the thread pool. New threads are created on every + connection until this many threads are created. + + + + hbase.thrift.maxWorkerThreads + 1000 + The maximum size of the thread pool. When the pending request queue + overflows, new threads are created until their number reaches this number. + After that, the server starts dropping connections. + + + + hbase.thrift.maxQueuedRequests + 1000 + The maximum number of pending Thrift connections waiting in the queue. If + there are no idle threads in the pool, the server queues requests. Only + when the queue overflows, new threads are added, up to + hbase.thrift.maxQueuedRequests threads. + + + + hbase.regionserver.thrift.framed + false + Use Thrift TFramedTransport on the server side. + This is the recommended transport for thrift servers and requires a similar setting + on the client side. Changing this to false will select the default transport, + vulnerable to DoS when malformed requests are issued due to THRIFT-601. + + + + hbase.regionserver.thrift.framed.max_frame_size_in_mb + 2 + Default frame size when using framed transport, in MB + + + hbase.regionserver.thrift.compact + false + Use Thrift TCompactProtocol binary serialization protocol. + + + hbase.rootdir.perms + 700 + FS Permissions for the root data subdirectory in a secure (kerberos) setup. + When master starts, it creates the rootdir with this permissions or sets the permissions + if it does not match. + + + + hbase.wal.dir.perms + 700 + FS Permissions for the root WAL directory in a secure(kerberos) setup. + When master starts, it creates the WAL dir with this permissions or sets the permissions + if it does not match. + + + + hbase.data.umask.enable + false + Enable, if true, that file permissions should be assigned + to the files written by the regionserver + + + + hbase.data.umask + 000 + File permissions that should be used to write data + files when hbase.data.umask.enable is true + + + + hbase.snapshot.enabled + true + Set to true to allow snapshots to be taken / restored / cloned. + + + hbase.snapshot.restore.take.failsafe.snapshot + true + Set to true to take a snapshot before the restore operation. + The snapshot taken will be used in case of failure, to restore the previous state. + At the end of the restore operation this snapshot will be deleted + + + + hbase.snapshot.restore.failsafe.name + hbase-failsafe-{snapshot.name}-{restore.timestamp} + Name of the failsafe snapshot taken by the restore operation. + You can use the {snapshot.name}, {table.name} and {restore.timestamp} variables + to create a name based on what you are restoring. + + + + hbase.snapshot.working.dir + + Location where the snapshotting process will occur. 
The location of the + completed snapshots will not change, but the temporary directory where the snapshot + process occurs will be set to this location. This can be a separate filesystem than + the root directory, for performance increase purposes. See HBASE-21098 for more + information + + + + hbase.server.compactchecker.interval.multiplier + 1000 + The number that determines how often we scan to see if compaction is necessary. + Normally, compactions are done after some events (such as memstore flush), but if + region didn't receive a lot of writes for some time, or due to different compaction + policies, it may be necessary to check it periodically. The interval between checks is + hbase.server.compactchecker.interval.multiplier multiplied by + hbase.server.thread.wakefrequency. + + + + hbase.lease.recovery.timeout + 900000 + How long we wait on dfs lease recovery in total before giving up. + + + hbase.lease.recovery.dfs.timeout + 64000 + How long between dfs recover lease invocations. Should be larger than the sum of + the time it takes for the namenode to issue a block recovery command as part of + datanode; dfs.heartbeat.interval and the time it takes for the primary + datanode, performing block recovery to timeout on a dead datanode; usually + dfs.client.socket-timeout. See the end of HBASE-8389 for more. + + + + hbase.column.max.version + 1 + New column family descriptors will use this value as the default number of versions + to keep. + + + + dfs.client.read.shortcircuit + + + If set to true, this configuration parameter enables short-circuit local + reads. + + + + dfs.domain.socket.path + + + This is a path to a UNIX domain socket that will be used for + communication between the DataNode and local HDFS clients, if + dfs.client.read.shortcircuit is set to true. If the string "_PORT" is + present in this path, it will be replaced by the TCP port of the DataNode. + Be careful about permissions for the directory that hosts the shared + domain socket; dfsclient will complain if open to other users than the HBase user. + + + + hbase.dfs.client.read.shortcircuit.buffer.size + 131072 + If the DFSClient configuration + dfs.client.read.shortcircuit.buffer.size is unset, we will + use what is configured here as the short circuit read default + direct byte buffer size. DFSClient native default is 1MB; HBase + keeps its HDFS files open so number of file blocks * 1MB soon + starts to add up and threaten OOME because of a shortage of + direct memory. So, we set it down from the default. Make + it > the default hbase block size set in the HColumnDescriptor + which is usually 64k. + + + + hbase.regionserver.checksum.verify + true + + If set to true (the default), HBase verifies the checksums for hfile + blocks. HBase writes checksums inline with the data when it writes out + hfiles. HDFS (as of this writing) writes checksums to a separate file + than the data file necessitating extra seeks. Setting this flag saves + some on i/o. Checksum verification by HDFS will be internally disabled + on hfile streams when this flag is set. If the hbase-checksum verification + fails, we will switch back to using HDFS checksums (so do not disable HDFS + checksums! And besides this feature applies to hfiles only, not to WALs). + If this parameter is set to false, then hbase will not verify any checksums, + instead it will depend on checksum verification being done in the HDFS client. 
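The snapshot properties a few entries above (hbase.snapshot.enabled and the failsafe snapshot taken before a restore) map onto Admin calls on the client side. A minimal sketch, given an Admin obtained as in the earlier compaction example; the snapshot name is a placeholder:

import java.io.IOException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;

public class SnapshotRestoreExample {
  // Requires hbase.snapshot.enabled=true; a failsafe snapshot is taken automatically
  // when hbase.snapshot.restore.take.failsafe.snapshot is true (the default).
  static void snapshotAndRestore(Admin admin, TableName table, String snapshotName)
      throws IOException {
    admin.snapshot(snapshotName, table);
    // A table must be disabled before it can be restored from a snapshot.
    admin.disableTable(table);
    admin.restoreSnapshot(snapshotName);
    admin.enableTable(table);
  }
}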
+ + + + hbase.hstore.bytes.per.checksum + 16384 + + Number of bytes in a newly created checksum chunk for HBase-level + checksums in hfile blocks. + + + + hbase.hstore.checksum.algorithm + CRC32C + + Name of an algorithm that is used to compute checksums. Possible values + are NULL, CRC32, CRC32C. + + + + hbase.client.scanner.max.result.size + 2097152 + Maximum number of bytes returned when calling a scanner's next method. + Note that when a single row is larger than this limit the row is still returned completely. + The default value is 2MB, which is good for 1ge networks. + With faster and/or high latency networks this value should be increased. + + + + hbase.server.scanner.max.result.size + 104857600 + Maximum number of bytes returned when calling a scanner's next method. + Note that when a single row is larger than this limit the row is still returned completely. + The default value is 100MB. + This is a safety setting to protect the server from OOM situations. + + + + hbase.status.published + false + + This setting activates the publication by the master of the status of the region server. + When a region server dies and its recovery starts, the master will push this information + to the client application, to let them cut the connection immediately instead of waiting + for a timeout. + + + + hbase.status.publisher.class + org.apache.hadoop.hbase.master.ClusterStatusPublisher$MulticastPublisher + + Implementation of the status publication with a multicast message. + + + + hbase.status.listener.class + org.apache.hadoop.hbase.client.ClusterStatusListener$MulticastListener + + Implementation of the status listener with a multicast message. + + + + hbase.status.multicast.address.ip + 226.1.1.3 + + Multicast address to use for the status publication by multicast. + + + + hbase.status.multicast.address.port + 16100 + + Multicast port to use for the status publication by multicast. + + + + hbase.dynamic.jars.dir + ${hbase.rootdir}/lib + + The directory from which the custom filter JARs can be loaded + dynamically by the region server without the need to restart. However, + an already loaded filter/co-processor class would not be un-loaded. See + HBASE-1936 for more details. + + Does not apply to coprocessors. + + + + hbase.security.authentication + simple + + Controls whether or not secure authentication is enabled for HBase. + Possible values are 'simple' (no authentication), and 'kerberos'. + + + + hbase.rest.filter.classes + org.apache.hadoop.hbase.rest.filter.GzipFilter + + Servlet filters for REST service. + + + + hbase.master.loadbalancer.class + org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer + + Class used to execute the regions balancing when the period occurs. + See the class comment for more on how it works + http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.html + It replaces the DefaultLoadBalancer as the default (since renamed + as the SimpleLoadBalancer). + + + + hbase.master.loadbalance.bytable + false + Factor Table name when the balancer runs. + Default: false. + + + + hbase.master.normalizer.class + org.apache.hadoop.hbase.master.normalizer.SimpleRegionNormalizer + + Class used to execute the region normalization when the period occurs. 
+ See the class comment for more on how it works + http://hbase.apache.org/devapidocs/org/apache/hadoop/hbase/master/normalizer/SimpleRegionNormalizer.html + + + + hbase.rest.csrf.enabled + false + + Set to true to enable protection against cross-site request forgery (CSRF) + + + + hbase.rest-csrf.browser-useragents-regex + ^Mozilla.*,^Opera.* + + A comma-separated list of regular expressions used to match against an HTTP + request's User-Agent header when protection against cross-site request + forgery (CSRF) is enabled for REST server by setting + hbase.rest.csrf.enabled to true. If the incoming User-Agent matches + any of these regular expressions, then the request is considered to be sent + by a browser, and therefore CSRF prevention is enforced. If the request's + User-Agent does not match any of these regular expressions, then the request + is considered to be sent by something other than a browser, such as scripted + automation. In this case, CSRF is not a potential attack vector, so + the prevention is not enforced. This helps achieve backwards-compatibility + with existing automation that has not been updated to send the CSRF + prevention header. + + + + hbase.security.exec.permission.checks + false + + If this setting is enabled and ACL based access control is active (the + AccessController coprocessor is installed either as a system coprocessor + or on a table as a table coprocessor) then you must grant all relevant + users EXEC privilege if they require the ability to execute coprocessor + endpoint calls. EXEC privilege, like any other permission, can be + granted globally to a user, or to a user on a per table or per namespace + basis. For more information on coprocessor endpoints, see the coprocessor + section of the HBase online manual. For more information on granting or + revoking permissions using the AccessController, see the security + section of the HBase online manual. + + + + hbase.procedure.regionserver.classes + + A comma-separated list of + org.apache.hadoop.hbase.procedure.RegionServerProcedureManager procedure managers that are + loaded by default on the active HRegionServer process. The lifecycle methods (init/start/stop) + will be called by the active HRegionServer process to perform the specific globally barriered + procedure. After implementing your own RegionServerProcedureManager, just put it in + HBase's classpath and add the fully qualified class name here. + + + + hbase.procedure.master.classes + + A comma-separated list of + org.apache.hadoop.hbase.procedure.MasterProcedureManager procedure managers that are + loaded by default on the active HMaster process. A procedure is identified by its signature and + users can use the signature and an instant name to trigger an execution of a globally barriered + procedure. After implementing your own MasterProcedureManager, just put it in HBase's classpath + and add the fully qualified class name here. + + + + hbase.coordinated.state.manager.class + org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager + Fully qualified name of class implementing coordinated state manager. + + + hbase.regionserver.storefile.refresh.period + 0 + + The period (in milliseconds) for refreshing the store files for the secondary regions. 0 + means this feature is disabled. Secondary regions sees new files (from flushes and + compactions) from primary once the secondary region refreshes the list of files in the + region (there is no notification mechanism). But too frequent refreshes might cause + extra Namenode pressure. 
If the files cannot be refreshed for longer than HFile TTL + (hbase.master.hfilecleaner.ttl) the requests are rejected. Configuring HFile TTL to a larger + value is also recommended with this setting. + + + + hbase.region.replica.replication.enabled + false + + Whether asynchronous WAL replication to the secondary region replicas is enabled or not. + If this is enabled, a replication peer named "region_replica_replication" will be created + which will tail the logs and replicate the mutations to region replicas for tables that + have region replication > 1. If this is enabled once, disabling this replication also + requires disabling the replication peer using shell or Admin java class. + Replication to secondary region replicas works over standard inter-cluster replication. + + + + hbase.http.filter.initializers + org.apache.hadoop.hbase.http.lib.StaticUserWebFilter + + A comma separated list of class names. Each class in the list must extend + org.apache.hadoop.hbase.http.FilterInitializer. The corresponding Filter will + be initialized. Then, the Filter will be applied to all user facing jsp + and servlet web pages. + The ordering of the list defines the ordering of the filters. + The default StaticUserWebFilter add a user principal as defined by the + hbase.http.staticuser.user property. + + + + hbase.security.visibility.mutations.checkauths + false + + This property if enabled, will check whether the labels in the visibility + expression are associated with the user issuing the mutation + + + + hbase.http.max.threads + 16 + + The maximum number of threads that the HTTP Server will create in its + ThreadPool. + + + + hbase.replication.rpc.codec + org.apache.hadoop.hbase.codec.KeyValueCodecWithTags + + The codec that is to be used when replication is enabled so that + the tags are also replicated. This is used along with HFileV3 which + supports tags in them. If tags are not used or if the hfile version used + is HFileV2 then KeyValueCodec can be used as the replication codec. Note that + using KeyValueCodecWithTags for replication when there are no tags causes no harm. + + + + hbase.replication.source.maxthreads + 10 + + The maximum number of threads any replication source will use for + shipping edits to the sinks in parallel. This also limits the number of + chunks each replication batch is broken into. Larger values can improve + the replication throughput between the master and slave clusters. The + default of 10 will rarely need to be changed. + + + + + hbase.http.staticuser.user + dr.stack + + The user name to filter as, on static web filters + while rendering content. An example use is the HDFS + web UI (user to be used for browsing files). + + + + hbase.regionserver.handler.abort.on.error.percent + 0.5 + The percent of region server RPC threads failed to abort RS. + -1 Disable aborting; 0 Abort if even a single handler has died; + 0.x Abort only when this percent of handlers have died; + 1 Abort only all of the handers have died. + + + + + hbase.mob.file.cache.size + 1000 + + Number of opened file handlers to cache. + A larger value will benefit reads by providing more file handlers per mob + file cache and would reduce frequent file opening and closing. + However, if this is set too high, this could lead to a "too many opened file handlers" + The default value is 1000. + + + + hbase.mob.cache.evict.period + 3600 + + The amount of time in seconds before the mob cache evicts cached mob files. + The default value is 3600 seconds. 
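The hbase.mob.* cache and compaction settings above only take effect for column families that have MOB enabled in their descriptor. A minimal sketch of creating such a family, again given an Admin obtained as in the earlier example; the table name, family name, and 100 KB threshold are illustrative placeholders:

import java.io.IOException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.util.Bytes;

public class MobTableExample {
  // Cells in family 'f' larger than 100 KB are written out as MOB files,
  // which is what the hbase.mob.* cache and compaction chores manage.
  static void createMobTable(Admin admin) throws IOException {
    admin.createTable(TableDescriptorBuilder
        .newBuilder(TableName.valueOf("mob_demo"))
        .setColumnFamily(ColumnFamilyDescriptorBuilder
            .newBuilder(Bytes.toBytes("f"))
            .setMobEnabled(true)
            .setMobThreshold(102400L)   // bytes
            .build())
        .build());
  }
}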
+ + + + hbase.mob.cache.evict.remain.ratio + 0.5f + + The ratio (between 0.0 and 1.0) of files that remains cached after an eviction + is triggered when the number of cached mob files exceeds the hbase.mob.file.cache.size. + The default value is 0.5f. + + + + hbase.master.mob.ttl.cleaner.period + 86400 + + The period that ExpiredMobFileCleanerChore runs. The unit is second. + The default value is one day. The MOB file name uses only the date part of + the file creation time in it. We use this time for deciding TTL expiry of + the files. So the removal of TTL expired files might be delayed. The max + delay might be 24 hrs. + + + + hbase.mob.compaction.mergeable.threshold + 1342177280 + + If the size of a mob file is less than this value, it's regarded as a small + file and needs to be merged in mob compaction. The default value is 1280MB. + + + + hbase.mob.delfile.max.count + 3 + + The max number of del files that is allowed in the mob compaction. + In the mob compaction, when the number of existing del files is larger than + this value, they are merged until number of del files is not larger this value. + The default value is 3. + + + + hbase.mob.compaction.batch.size + 100 + + The max number of the mob files that is allowed in a batch of the mob compaction. + The mob compaction merges the small mob files to bigger ones. If the number of the + small files is very large, it could lead to a "too many opened file handlers" in the merge. + And the merge has to be split into batches. This value limits the number of mob files + that are selected in a batch of the mob compaction. The default value is 100. + + + + hbase.mob.compaction.chore.period + 604800 + + The period that MobCompactionChore runs. The unit is second. + The default value is one week. + + + + hbase.mob.compactor.class + org.apache.hadoop.hbase.mob.compactions.PartitionedMobCompactor + + Implementation of mob compactor, the default one is PartitionedMobCompactor. + + + + hbase.mob.compaction.threads.max + 1 + + The max number of threads used in MobCompactor. + + + + hbase.snapshot.master.timeout.millis + 300000 + + Timeout for master for the snapshot procedure execution. + + + + hbase.snapshot.region.timeout + 300000 + + Timeout for regionservers to keep threads in snapshot request pool waiting. + + + + hbase.rpc.rows.warning.threshold + 5000 + + Number of rows in a batch operation above which a warning will be logged. + + + + hbase.master.wait.on.service.seconds + 30 + Default is 5 minutes. Make it 30 seconds for tests. See + HBASE-19794 for some context. + + + + hbase.master.cleaner.snapshot.interval + 1800000 + + Snapshot Cleanup chore interval in milliseconds. + The cleanup thread keeps running at this interval + to find all snapshots that are expired based on TTL + and delete them. + + + + hbase.master.snapshot.ttl + 0 + + Default Snapshot TTL to be considered when the user does not specify TTL while + creating snapshot. Default value 0 indicates FOREVERE - snapshot should not be + automatically deleted until it is manually deleted + + + + hbase.master.regions.recovery.check.interval + 1200000 + + Regions Recovery Chore interval in milliseconds. + This chore keeps running at this interval to + find all regions with configurable max store file ref count + and reopens them. + + + + hbase.regions.recovery.store.file.ref.count + -1 + + Very large number of ref count on a compacted + store file indicates that it is a ref leak + on that object(compacted store file). 
+ Such files can not be removed after + it is invalidated via compaction. + Only way to recover in such scenario is to + reopen the region which can release + all resources, like the refcount, + leases, etc. This config represents Store files Ref + Count threshold value considered for reopening + regions. Any region with compacted store files + ref count > this value would be eligible for + reopening by master. Here, we get the max + refCount among all refCounts on all + compacted away store files that belong to a + particular region. Default value -1 indicates + this feature is turned off. Only positive + integer value should be provided to + enable this feature. + + + + hbase.regionserver.slowlog.ringbuffer.size + 256 + + Default size of ringbuffer to be maintained by each RegionServer in order + to store online slowlog responses. This is an in-memory ring buffer of + requests that were judged to be too slow in addition to the responseTooSlow + logging. The in-memory representation would be complete. + For more details, please look into Doc Section: + Get Slow Response Log from shell + + + + hbase.regionserver.slowlog.buffer.enabled + false + + Indicates whether RegionServers have ring buffer running for storing + Online Slow logs in FIFO manner with limited entries. The size of + the ring buffer is indicated by config: hbase.regionserver.slowlog.ringbuffer.size + The default value is false, turn this on and get latest slowlog + responses with complete data. + + + + hbase.regionserver.slowlog.systable.enabled + false + + Should be enabled only if hbase.regionserver.slowlog.buffer.enabled is enabled. If enabled + (true), all slow/large RPC logs would be persisted to system table hbase:slowlog (in addition + to in-memory ring buffer at each RegionServer). The records are stored in increasing + order of time. Operators can scan the table with various combination of ColumnValueFilter. + More details are provided in the doc section: + "Get Slow/Large Response Logs from System table hbase:slowlog" + + + + hbase.rpc.rows.size.threshold.reject + false + + If value is true, RegionServer will abort batch requests of Put/Delete with number of rows + in a batch operation exceeding threshold defined by value of config: + hbase.rpc.rows.warning.threshold. The default value is false and hence, by default, only + warning will be logged. This config should be turned on to prevent RegionServer from serving + very large batch size of rows and this way we can improve CPU usages by discarding + too large batch request. + + + + hbase.namedqueue.provider.classes + + org.apache.hadoop.hbase.namequeues.impl.SlowLogQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerDecisionQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerRejectionQueueService + + + Default values for NamedQueueService implementors. This comma separated full class names + represent all implementors of NamedQueueService that we would like to be invoked by + LogEvent handler service. One example of NamedQueue service is SlowLogQueueService which + is used to store slow/large RPC logs in ringbuffer at each RegionServer. + All implementors of NamedQueueService should be found under package: + "org.apache.hadoop.hbase.namequeues.impl" + + + + hbase.master.balancer.decision.buffer.enabled + false + + Indicates whether active HMaster has ring buffer running for storing + balancer decisions in FIFO manner with limited entries. 
The size of + the ring buffer is indicated by config: hbase.master.balancer.decision.queue.size + + + + hbase.master.balancer.rejection.buffer.enabled + false + + Indicates whether active HMaster has ring buffer running for storing + balancer rejection in FIFO manner with limited entries. The size of + the ring buffer is indicated by config: hbase.master.balancer.rejection.queue.size + + + diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystem.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystem.java index 92f83aad7fd7..0f364eddbc61 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystem.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystem.java @@ -369,7 +369,8 @@ private Path getRandomInlinePath() { private void verifyFileStatus(FileStatus expected, Path inlinePath, long expectedLength, FileStatus actual) { assertEquals(inlinePath, actual.getPath()); assertEquals(expectedLength, actual.getLen()); - assertEquals(expected.getAccessTime(), actual.getAccessTime()); + // removing below assertion as it is flaky on rare occasion (difference is in single-digit ms) + // assertEquals(expected.getAccessTime(), actual.getAccessTime()); assertEquals(expected.getBlockSize(), actual.getBlockSize()); assertEquals(expected.getGroup(), actual.getGroup()); assertEquals(expected.getModificationTime(), actual.getModificationTime()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java index cc59b4602479..f09ecf76b2d8 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestInLineFileSystemHFileInLining.java @@ -19,12 +19,12 @@ package org.apache.hudi.common.fs.inline; import org.apache.hudi.common.testutils.FileSystemTestUtils; -import org.apache.hudi.io.storage.HoodieHBaseKVComparator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.io.hfile.CacheConfig; @@ -39,10 +39,12 @@ import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.Arrays; import java.util.HashSet; import java.util.Set; import java.util.UUID; +import static org.apache.hadoop.hbase.CellComparatorImpl.COMPARATOR; import static org.apache.hudi.common.testutils.FileSystemTestUtils.FILE_SCHEME; import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM; import static org.apache.hudi.common.testutils.FileSystemTestUtils.getPhantomFile; @@ -56,11 +58,12 @@ */ public class TestInLineFileSystemHFileInLining { + private static final String LOCAL_FORMATTER = "%010d"; + private static final String VALUE_PREFIX = "value"; + private static final int MIN_BLOCK_SIZE = 1024; private final Configuration inMemoryConf; private final Configuration inlineConf; - private final int minBlockSize = 1024; - private static final String LOCAL_FORMATTER = "%010d"; - private int maxRows = 100 + RANDOM.nextInt(1000); + private final int maxRows = 100 + RANDOM.nextInt(1000); private Path generatedPath; public 
TestInLineFileSystemHFileInLining() { @@ -88,12 +91,11 @@ public void testSimpleInlineFileSystem() throws IOException { CacheConfig cacheConf = new CacheConfig(inMemoryConf); FSDataOutputStream fout = createFSOutput(outerInMemFSPath, inMemoryConf); HFileContext meta = new HFileContextBuilder() - .withBlockSize(minBlockSize) + .withBlockSize(MIN_BLOCK_SIZE).withCellComparator(COMPARATOR) .build(); HFile.Writer writer = HFile.getWriterFactory(inMemoryConf, cacheConf) .withOutputStream(fout) .withFileContext(meta) - .withComparator(new HoodieHBaseKVComparator()) .create(); writeRecords(writer); @@ -110,9 +112,9 @@ public void testSimpleInlineFileSystem() throws IOException { InLineFileSystem inlineFileSystem = (InLineFileSystem) inlinePath.getFileSystem(inlineConf); FSDataInputStream fin = inlineFileSystem.open(inlinePath); - HFile.Reader reader = HFile.createReader(inlineFileSystem, inlinePath, cacheConf, inlineConf); + HFile.Reader reader = HFile.createReader(inlineFileSystem, inlinePath, cacheConf, true, inlineConf); // Load up the index. - reader.loadFileInfo(); + reader.getHFileInfo(); // Get a scanner that caches and that does not use pread. HFileScanner scanner = reader.getScanner(true, false); // Align scanner at start of the file. @@ -121,21 +123,24 @@ public void testSimpleInlineFileSystem() throws IOException { Set rowIdsToSearch = getRandomValidRowIds(10); for (int rowId : rowIdsToSearch) { - assertEquals(0, scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))), + KeyValue keyValue = new KeyValue.KeyOnlyKeyValue(getSomeKey(rowId)); + assertEquals(0, scanner.seekTo(keyValue), "location lookup failed"); // read the key and see if it matches - ByteBuffer readKey = scanner.getKey(); - assertArrayEquals(getSomeKey(rowId), Bytes.toBytes(readKey), "seeked key does not match"); - scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))); + Cell cell = scanner.getCell(); + byte[] key = Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowOffset() + cell.getRowLength()); + assertArrayEquals(Arrays.copyOfRange(keyValue.getRowArray(), keyValue.getRowOffset(), keyValue.getRowOffset() + keyValue.getRowLength()), key, + "seeked key does not match"); + scanner.seekTo(keyValue); ByteBuffer val1 = scanner.getValue(); - scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))); + scanner.seekTo(keyValue); ByteBuffer val2 = scanner.getValue(); assertArrayEquals(Bytes.toBytes(val1), Bytes.toBytes(val2)); } int[] invalidRowIds = {-4, maxRows, maxRows + 1, maxRows + 120, maxRows + 160, maxRows + 1000}; for (int rowId : invalidRowIds) { - assertNotEquals(0, scanner.seekTo(KeyValue.createKeyValueFromKey(getSomeKey(rowId))), + assertNotEquals(0, scanner.seekTo(new KeyValue.KeyOnlyKeyValue(getSomeKey(rowId))), "location lookup should have failed"); } reader.close(); @@ -155,7 +160,7 @@ private Set getRandomValidRowIds(int count) { } private byte[] getSomeKey(int rowId) { - KeyValue kv = new KeyValue(String.format(LOCAL_FORMATTER, Integer.valueOf(rowId)).getBytes(), + KeyValue kv = new KeyValue(String.format(LOCAL_FORMATTER, rowId).getBytes(), Bytes.toBytes("family"), Bytes.toBytes("qual"), HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put); return kv.getKey(); } @@ -169,17 +174,15 @@ private void writeRecords(HFile.Writer writer) throws IOException { writer.close(); } - private int writeSomeRecords(HFile.Writer writer) + private void writeSomeRecords(HFile.Writer writer) throws IOException { - String value = "value"; KeyValue kv; for (int i = 0; i < (maxRows); i++) 
{ - String key = String.format(LOCAL_FORMATTER, Integer.valueOf(i)); + String key = String.format(LOCAL_FORMATTER, i); kv = new KeyValue(Bytes.toBytes(key), Bytes.toBytes("family"), Bytes.toBytes("qual"), - Bytes.toBytes(value + key)); + Bytes.toBytes(VALUE_PREFIX + key)); writer.append(kv); } - return (maxRows); } private void readAllRecords(HFileScanner scanner) throws IOException { @@ -187,30 +190,27 @@ private void readAllRecords(HFileScanner scanner) throws IOException { } // read the records and check - private int readAndCheckbytes(HFileScanner scanner, int start, int n) + private void readAndCheckbytes(HFileScanner scanner, int start, int n) throws IOException { - String value = "value"; int i = start; for (; i < (start + n); i++) { - ByteBuffer key = scanner.getKey(); - ByteBuffer val = scanner.getValue(); - String keyStr = String.format(LOCAL_FORMATTER, Integer.valueOf(i)); - String valStr = value + keyStr; + Cell cell = scanner.getCell(); + byte[] key = Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowOffset() + cell.getRowLength()); + byte[] val = Arrays.copyOfRange(cell.getValueArray(), cell.getValueOffset(), cell.getValueOffset() + cell.getValueLength()); + String keyStr = String.format(LOCAL_FORMATTER, i); + String valStr = VALUE_PREFIX + keyStr; KeyValue kv = new KeyValue(Bytes.toBytes(keyStr), Bytes.toBytes("family"), Bytes.toBytes("qual"), Bytes.toBytes(valStr)); - byte[] keyBytes = new KeyValue.KeyOnlyKeyValue(Bytes.toBytes(key), 0, - Bytes.toBytes(key).length).getKey(); - assertArrayEquals(kv.getKey(), keyBytes, - "bytes for keys do not match " + keyStr + " " + Bytes.toString(Bytes.toBytes(key))); - byte[] valBytes = Bytes.toBytes(val); - assertArrayEquals(Bytes.toBytes(valStr), valBytes, - "bytes for vals do not match " + valStr + " " + Bytes.toString(valBytes)); + byte[] keyBytes = new KeyValue.KeyOnlyKeyValue(key, 0, key.length).getKey(); + assertArrayEquals(Arrays.copyOfRange(kv.getRowArray(), kv.getRowOffset(), kv.getRowOffset() + kv.getRowLength()), keyBytes, + "bytes for keys do not match " + keyStr + " " + Bytes.toString(key)); + assertArrayEquals(Bytes.toBytes(valStr), val, + "bytes for vals do not match " + valStr + " " + Bytes.toString(val)); if (!scanner.next()) { break; } } assertEquals(i, start + n - 1); - return (start + n); } private long generateOuterFile(Path outerPath, byte[] inlineBytes) throws IOException { diff --git a/hudi-hadoop-mr/pom.xml b/hudi-hadoop-mr/pom.xml index bf87bfaa36a8..2b24f6a99ea9 100644 --- a/hudi-hadoop-mr/pom.xml +++ b/hudi-hadoop-mr/pom.xml @@ -128,11 +128,6 @@ - - - src/main/resources - - org.apache.rat diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java index 3c7a6034b4f4..f2a5395af32e 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestBase.java @@ -62,6 +62,7 @@ public abstract class ITTestBase { protected static final String ADHOC_2_CONTAINER = "/adhoc-2"; protected static final String HIVESERVER = "/hiveserver"; protected static final String PRESTO_COORDINATOR = "/presto-coordinator-1"; + protected static final String TRINO_COORDINATOR = "/trino-coordinator-1"; protected static final String HOODIE_WS_ROOT = "/var/hoodie/ws"; protected static final String HOODIE_JAVA_APP = HOODIE_WS_ROOT + "/hudi-spark-datasource/hudi-spark/run_hoodie_app.sh"; protected static final String HOODIE_GENERATE_APP = HOODIE_WS_ROOT + 
"/hudi-spark-datasource/hudi-spark/run_hoodie_generate_app.sh"; @@ -76,6 +77,7 @@ public abstract class ITTestBase { HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-utilities.jar"; protected static final String HIVE_SERVER_JDBC_URL = "jdbc:hive2://hiveserver:10000"; protected static final String PRESTO_COORDINATOR_URL = "presto-coordinator-1:8090"; + protected static final String TRINO_COORDINATOR_URL = "trino-coordinator-1:8091"; protected static final String HADOOP_CONF_DIR = "/etc/hadoop"; // Skip these lines when capturing output from hive @@ -122,6 +124,12 @@ static String getPrestoConsoleCommand(String commandFile) { .append(" -f " + commandFile).toString(); } + static String getTrinoConsoleCommand(String commandFile) { + return new StringBuilder().append("trino --server " + TRINO_COORDINATOR_URL) + .append(" --catalog hive --schema default") + .append(" -f " + commandFile).toString(); + } + @BeforeEach public void init() { String dockerHost = (OVERRIDDEN_DOCKER_HOST != null) ? OVERRIDDEN_DOCKER_HOST : DEFAULT_DOCKER_HOST; @@ -213,9 +221,11 @@ private TestExecStartResultCallback executeCommandInDocker(String containerName, // Each execution of command(s) in docker should not be more than 15 mins. Otherwise, it is deemed stuck. We will // try to capture stdout and stderr of the stuck process. + LOG.error("containerName: " + containerName); + LOG.error("Command: " + Arrays.asList(command)); boolean completed = dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false).exec(callback) - .awaitCompletion(540, SECONDS); + .awaitCompletion(540, SECONDS); if (!completed) { callback.getStderr().flush(); callback.getStdout().flush(); @@ -228,8 +238,11 @@ private TestExecStartResultCallback executeCommandInDocker(String containerName, int exitCode = dockerClient.inspectExecCmd(createCmdResponse.getId()).exec().getExitCode(); LOG.info("Exit code for command : " + exitCode); if (exitCode != 0) { - LOG.error("\n\n ###### Stdout #######\n" + callback.getStdout().toString()); + //LOG.error("\n\n ###### Stdout #######\n" + callback.getStdout().toString()); } + callback.getStderr().flush(); + callback.getStdout().flush(); + LOG.error("\n\n ###### Stdout #######\n" + callback.getStdout().toString()); LOG.error("\n\n ###### Stderr #######\n" + callback.getStderr().toString()); if (checkIfSucceed) { @@ -309,6 +322,20 @@ void executePrestoCopyCommand(String fromFile, String remotePath) { .exec(); } + Pair executeTrinoCommandFile(String commandFile) throws Exception { + String trinoCmd = getTrinoConsoleCommand(commandFile); + TestExecStartResultCallback callback = executeCommandStringInDocker(ADHOC_1_CONTAINER, trinoCmd, true); + return Pair.of(callback.getStdout().toString().trim(), callback.getStderr().toString().trim()); + } + + void executeTrinoCopyCommand(String fromFile, String remotePath) { + Container adhocContainer = runningContainers.get(ADHOC_1_CONTAINER); + dockerClient.copyArchiveToContainerCmd(adhocContainer.getId()) + .withHostResource(fromFile) + .withRemotePath(remotePath) + .exec(); + } + private void saveUpLogs() { try { // save up the Hive log files for introspection @@ -316,8 +343,8 @@ private void saveUpLogs() { executeCommandStringInDocker(HIVESERVER, "cat /tmp/root/hive.log | grep -i exception -A 10 -B 5", false).getStdout().toString(); String filePath = System.getProperty("java.io.tmpdir") + "/" + System.currentTimeMillis() + "-hive.log"; FileIOUtils.writeStringToFile(hiveLogStr, filePath); - LOG.info("Hive log saved up at : " + filePath); 
- LOG.info("<=========== Full hive log ===============>\n" + LOG.error("Hive log saved up at : " + filePath); + LOG.error("<=========== Full hive log ===============>\n" + "\n" + hiveLogStr + "\n <==========================================>"); } catch (Exception e) { @@ -334,6 +361,11 @@ void assertStdOutContains(Pair stdOutErr, String expectedOutput, String stdOutSingleSpaced = singleSpace(stdOutErr.getLeft()).replaceAll(" ", ""); expectedOutput = singleSpace(expectedOutput).replaceAll(" ", ""); + LOG.error("stdOutErr : " + stdOutErr.getLeft()); + LOG.error("stdOutErr.getRight : " + stdOutErr.getRight()); + LOG.error("stdOutSingleSpaced : " + stdOutSingleSpaced); + LOG.error("expectedOutput : " + expectedOutput); + int lastIndex = 0; int count = 0; while (lastIndex != -1) { diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java index b68d06a64329..f142ebd502f1 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieDemo.java @@ -21,7 +21,6 @@ import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.collection.Pair; - import org.apache.hudi.keygen.SimpleKeyGenerator; import org.junit.jupiter.api.AfterEach; @@ -33,24 +32,34 @@ /** * Goes through steps described in https://hudi.apache.org/docker_demo.html - * + *
* To run this as a standalone test in the IDE or command line. First bring up the demo setup using * `docker/setup_demo.sh` and then run the test class as you would do normally. */ public class ITTestHoodieDemo extends ITTestBase { + private static final String TRINO_TABLE_CHECK_FILENAME = "trino-table-check.commands"; + private static final String TRINO_BATCH1_FILENAME = "trino-batch1.commands"; + private static final String TRINO_BATCH2_FILENAME = "trino-batch2-after-compaction.commands"; + private static final String HDFS_DATA_DIR = "/usr/hive/data/input"; private static final String HDFS_BATCH_PATH1 = HDFS_DATA_DIR + "/batch_1.json"; private static final String HDFS_BATCH_PATH2 = HDFS_DATA_DIR + "/batch_2.json"; private static final String HDFS_PRESTO_INPUT_TABLE_CHECK_PATH = HDFS_DATA_DIR + "/presto-table-check.commands"; private static final String HDFS_PRESTO_INPUT_BATCH1_PATH = HDFS_DATA_DIR + "/presto-batch1.commands"; private static final String HDFS_PRESTO_INPUT_BATCH2_PATH = HDFS_DATA_DIR + "/presto-batch2-after-compaction.commands"; + private static final String HDFS_TRINO_INPUT_TABLE_CHECK_PATH = HDFS_DATA_DIR + "/" + TRINO_TABLE_CHECK_FILENAME; + private static final String HDFS_TRINO_INPUT_BATCH1_PATH = HDFS_DATA_DIR + "/" + TRINO_BATCH1_FILENAME; + private static final String HDFS_TRINO_INPUT_BATCH2_PATH = HDFS_DATA_DIR + "/" + TRINO_BATCH2_FILENAME; private static final String INPUT_BATCH_PATH1 = HOODIE_WS_ROOT + "/docker/demo/data/batch_1.json"; private static final String PRESTO_INPUT_TABLE_CHECK_RELATIVE_PATH = "/docker/demo/presto-table-check.commands"; private static final String PRESTO_INPUT_BATCH1_RELATIVE_PATH = "/docker/demo/presto-batch1.commands"; private static final String INPUT_BATCH_PATH2 = HOODIE_WS_ROOT + "/docker/demo/data/batch_2.json"; private static final String PRESTO_INPUT_BATCH2_RELATIVE_PATH = "/docker/demo/presto-batch2-after-compaction.commands"; + private static final String TRINO_INPUT_TABLE_CHECK_RELATIVE_PATH = "/docker/demo/" + TRINO_TABLE_CHECK_FILENAME; + private static final String TRINO_INPUT_BATCH1_RELATIVE_PATH = "/docker/demo/" + TRINO_BATCH1_FILENAME; + private static final String TRINO_INPUT_BATCH2_RELATIVE_PATH = "/docker/demo/" + TRINO_BATCH2_FILENAME; private static final String COW_BASE_PATH = "/user/hive/warehouse/stock_ticks_cow"; private static final String MOR_BASE_PATH = "/user/hive/warehouse/stock_ticks_mor"; @@ -110,12 +119,14 @@ public void testParquetDemo() throws Exception { ingestFirstBatchAndHiveSync(); testHiveAfterFirstBatch(); testPrestoAfterFirstBatch(); + testTrinoAfterFirstBatch(); testSparkSQLAfterFirstBatch(); // batch 2 ingestSecondBatchAndHiveSync(); testHiveAfterSecondBatch(); testPrestoAfterSecondBatch(); + testTrinoAfterSecondBatch(); testSparkSQLAfterSecondBatch(); testIncrementalHiveQueryBeforeCompaction(); testIncrementalSparkSQLQuery(); @@ -125,6 +136,7 @@ public void testParquetDemo() throws Exception { testHiveAfterSecondBatchAfterCompaction(); testPrestoAfterSecondBatchAfterCompaction(); + testTrinoAfterSecondBatchAfterCompaction(); testIncrementalHiveQueryAfterCompaction(); } @@ -133,7 +145,7 @@ public void testParquetDemo() throws Exception { public void testHFileDemo() throws Exception { baseFileFormat = HoodieFileFormat.HFILE; - // TODO: Preseto and SparkSQL support for HFile format + // TODO: Presto, Trino and SparkSQL support for HFile format setupDemo(); @@ -141,12 +153,14 @@ public void testHFileDemo() throws Exception { ingestFirstBatchAndHiveSync(); testHiveAfterFirstBatch(); 
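The Trino and Presto checks in this class assert on how many times an expected row shows up in the CLI output after whitespace is stripped (see the assertStdOutContains changes earlier in this patch). A standalone sketch of that normalize-and-count idea, under hypothetical class and method names rather than the harness code itself:

    public class OutputMatchSketch {
      // Collapse whitespace runs, then drop spaces so column padding and line breaks are ignored.
      static String normalize(String s) {
        return s.replaceAll("\\s+", " ").replaceAll(" ", "");
      }

      // Count non-overlapping occurrences of the expected snippet in the normalized output.
      static int countOccurrences(String actual, String expected) {
        String a = normalize(actual);
        String e = normalize(expected);
        int count = 0;
        for (int idx = a.indexOf(e); idx != -1; idx = a.indexOf(e, idx + e.length())) {
          count++;
        }
        return count;
      }

      public static void main(String[] args) {
        String stdout = "\"GOOG\",\"2018-08-31 10:29:00\"\n\"GOOG\",\"2018-08-31 10:29:00\"";
        // Prints 2, the kind of expected-count assertion used against the query output.
        System.out.println(countOccurrences(stdout, "\"GOOG\",\"2018-08-31 10:29:00\""));
      }
    }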
//testPrestoAfterFirstBatch(); + //testTrinoAfterFirstBatch(); //testSparkSQLAfterFirstBatch(); // batch 2 ingestSecondBatchAndHiveSync(); testHiveAfterSecondBatch(); //testPrestoAfterSecondBatch(); + //testTrinoAfterSecondBatch(); //testSparkSQLAfterSecondBatch(); testIncrementalHiveQueryBeforeCompaction(); //testIncrementalSparkSQLQuery(); @@ -155,6 +169,7 @@ public void testHFileDemo() throws Exception { scheduleAndRunCompaction(); testHiveAfterSecondBatchAfterCompaction(); //testPrestoAfterSecondBatchAfterCompaction(); + //testTrinoAfterSecondBatchAfterCompaction(); //testIncrementalHiveQueryAfterCompaction(); } @@ -162,7 +177,8 @@ private void setupDemo() throws Exception { List cmds = CollectionUtils.createImmutableList("hdfs dfsadmin -safemode wait", "hdfs dfs -mkdir -p " + HDFS_DATA_DIR, "hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH1 + " " + HDFS_BATCH_PATH1, - "/bin/bash " + DEMO_CONTAINER_SCRIPT); + "/bin/bash " + DEMO_CONTAINER_SCRIPT, + "mkdir -p " + HDFS_DATA_DIR); executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds); @@ -174,6 +190,10 @@ private void setupDemo() throws Exception { executePrestoCopyCommand(System.getProperty("user.dir") + "/.." + PRESTO_INPUT_TABLE_CHECK_RELATIVE_PATH, HDFS_DATA_DIR); executePrestoCopyCommand(System.getProperty("user.dir") + "/.." + PRESTO_INPUT_BATCH1_RELATIVE_PATH, HDFS_DATA_DIR); executePrestoCopyCommand(System.getProperty("user.dir") + "/.." + PRESTO_INPUT_BATCH2_RELATIVE_PATH, HDFS_DATA_DIR); + + executeTrinoCopyCommand(System.getProperty("user.dir") + "/.." + TRINO_INPUT_TABLE_CHECK_RELATIVE_PATH, HDFS_DATA_DIR); + executeTrinoCopyCommand(System.getProperty("user.dir") + "/.." + TRINO_INPUT_BATCH1_RELATIVE_PATH, HDFS_DATA_DIR); + executeTrinoCopyCommand(System.getProperty("user.dir") + "/.." + TRINO_INPUT_BATCH2_RELATIVE_PATH, HDFS_DATA_DIR); } private void ingestFirstBatchAndHiveSync() throws Exception { @@ -335,6 +355,20 @@ private void testPrestoAfterFirstBatch() throws Exception { "\"GOOG\",\"2018-08-31 10:29:00\",\"3391\",\"1230.1899\",\"1230.085\"", 2); } + private void testTrinoAfterFirstBatch() throws Exception { + Pair stdOutErrPair = executeTrinoCommandFile(HDFS_TRINO_INPUT_TABLE_CHECK_PATH); + assertStdOutContains(stdOutErrPair, "stock_ticks_cow", 2); + assertStdOutContains(stdOutErrPair, "stock_ticks_mor", 4); + + stdOutErrPair = executeTrinoCommandFile(HDFS_TRINO_INPUT_BATCH1_PATH); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:29:00\"", 4); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:29:00\",\"3391\",\"1230.1899\",\"1230.085\"", 2); + } + private void testHiveAfterSecondBatch() throws Exception { Pair stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS); assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n" @@ -361,7 +395,21 @@ private void testPrestoAfterSecondBatch() throws Exception { assertStdOutContains(stdOutErrPair, "\"GOOG\",\"2018-08-31 10:59:00\"", 2); assertStdOutContains(stdOutErrPair, - "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\"",2); + "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:29:00\",\"3391\",\"1230.1899\",\"1230.085\""); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:59:00\",\"9021\",\"1227.1993\",\"1227.215\""); + } + + private void testTrinoAfterSecondBatch() throws 
Exception { + Pair stdOutErrPair = executeTrinoCommandFile(HDFS_TRINO_INPUT_BATCH1_PATH); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:29:00\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:59:00\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\"", 2); assertStdOutContains(stdOutErrPair, "\"GOOG\",\"2018-08-31 10:29:00\",\"3391\",\"1230.1899\",\"1230.085\""); assertStdOutContains(stdOutErrPair, @@ -390,6 +438,16 @@ private void testPrestoAfterSecondBatchAfterCompaction() throws Exception { "\"GOOG\",\"2018-08-31 10:59:00\",\"9021\",\"1227.1993\",\"1227.215\""); } + private void testTrinoAfterSecondBatchAfterCompaction() throws Exception { + Pair stdOutErrPair = executeTrinoCommandFile(HDFS_TRINO_INPUT_BATCH2_PATH); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:59:00\"", 2); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 09:59:00\",\"6330\",\"1230.5\",\"1230.02\""); + assertStdOutContains(stdOutErrPair, + "\"GOOG\",\"2018-08-31 10:59:00\",\"9021\",\"1227.1993\",\"1227.215\""); + } + private void testSparkSQLAfterSecondBatch() throws Exception { Pair stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH2_COMMANDS, true); assertStdOutContains(stdOutErrPair, diff --git a/packaging/hudi-flink-bundle/pom.xml b/packaging/hudi-flink-bundle/pom.xml index 222478090b4b..2e517e67efd0 100644 --- a/packaging/hudi-flink-bundle/pom.xml +++ b/packaging/hudi-flink-bundle/pom.xml @@ -70,6 +70,7 @@ META-INF/LICENSE target/classes/META-INF/LICENSE + @@ -137,7 +138,7 @@ org.apache.hive:hive-service org.apache.hive:hive-service-rpc org.apache.hive:hive-exec - org.apache.hive:hive-standalone-metastore + org.apache.hive:hive-standalone-metastore org.apache.hive:hive-metastore org.apache.hive:hive-jdbc org.datanucleus:datanucleus-core @@ -147,10 +148,18 @@ org.apache.hbase:hbase-common org.apache.hbase:hbase-client + org.apache.hbase:hbase-hadoop-compat + org.apache.hbase:hbase-hadoop2-compat + org.apache.hbase:hbase-metrics + org.apache.hbase:hbase-metrics-api org.apache.hbase:hbase-server - org.apache.hbase:hbase-protocol - org.apache.htrace:htrace-core + org.apache.hbase:hbase-protocol-shaded + org.apache.hbase.thirdparty:hbase-shaded-miscellaneous + org.apache.hbase.thirdparty:hbase-shaded-netty + org.apache.hbase.thirdparty:hbase-shaded-protobuf + org.apache.htrace:htrace-core4 commons-codec:commons-codec + commons-io:commons-io @@ -162,6 +171,22 @@ org.apache.avro. ${flink.bundle.shade.prefix}org.apache.avro. + + org.apache.commons.io. + org.apache.hudi.org.apache.commons.io. + + + org.apache.hadoop.hbase. + org.apache.hudi.org.apache.hadoop.hbase. + + + org.apache.hbase. + org.apache.hudi.org.apache.hbase. + + + org.apache.htrace. + org.apache.hudi.org.apache.htrace. + com.yammer.metrics. ${flink.bundle.shade.prefix}com.yammer.metrics. @@ -191,6 +216,70 @@ com.fasterxml.jackson. ${flink.bundle.shade.prefix}com.fasterxml.jackson. 
+ + org.apache.hadoop.metrics2.MetricHistogram + org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram + + + + org.apache.hadoop.metrics2.MetricsExecutor + org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor + + + + org.apache.hadoop.metrics2.impl.JmxCacheBuster + org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster + + + org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + + + + org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + + + + org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + + + + org.apache.hadoop.metrics2.lib.MutableFastCounter + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter + + + + org.apache.hadoop.metrics2.lib.MutableHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram + + + + org.apache.hadoop.metrics2.lib.MutableRangeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableSizeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableTimeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram + + + + org.apache.hadoop.metrics2.util.MetricQuantile + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile + + + + org.apache.hadoop.metrics2.util.MetricSampleQuantiles + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles + + @@ -199,7 +288,14 @@ META-INF/*.SF META-INF/*.DSA META-INF/*.RSA + META-INF/maven/com.google.protobuf/** + META-INF/maven/commons-io/** + META-INF/maven/org.apache.hbase/** + META-INF/maven/org.apache.hbase.thirdparty/** + META-INF/maven/org.apache.htrace/** META-INF/services/javax.* + **/*.proto + hbase-webapps/** @@ -597,9 +693,29 @@ org.apache.hbase - hbase-protocol + hbase-protocol-shaded ${hbase.version} + + org.apache.hbase.thirdparty + hbase-shaded-miscellaneous + ${hbase-thirdparty.version} + + + org.apache.hbase.thirdparty + hbase-shaded-netty + ${hbase-thirdparty.version} + + + org.apache.hbase + hbase-hadoop-compat + ${hbase.version} + + + org.apache.hbase.thirdparty + hbase-shaded-protobuf + ${hbase-thirdparty.version} + org.apache.htrace htrace-core diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index f6215b1e017a..5311e7479df0 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -55,12 +55,13 @@ - true + true META-INF/LICENSE target/classes/META-INF/LICENSE + @@ -74,11 +75,19 @@ com.esotericsoftware:minlog org.apache.hbase:hbase-common org.apache.hbase:hbase-client - org.apache.hbase:hbase-protocol + org.apache.hbase:hbase-hadoop-compat + org.apache.hbase:hbase-hadoop2-compat + org.apache.hbase:hbase-metrics + org.apache.hbase:hbase-metrics-api + org.apache.hbase:hbase-protocol-shaded org.apache.hbase:hbase-server - org.apache.htrace:htrace-core + org.apache.hbase.thirdparty:hbase-shaded-miscellaneous + org.apache.hbase.thirdparty:hbase-shaded-netty + org.apache.hbase.thirdparty:hbase-shaded-protobuf + org.apache.htrace:htrace-core4 com.yammer.metrics:metrics-core com.google.guava:guava + commons-io:commons-io @@ -102,6 +111,22 @@ org.apache.avro. org.apache.hudi.org.apache.avro. + + org.apache.commons.io. + org.apache.hudi.org.apache.commons.io. 
+ + + org.apache.hadoop.hbase. + org.apache.hudi.org.apache.hadoop.hbase. + + + org.apache.hbase. + org.apache.hudi.org.apache.hbase. + + + org.apache.htrace. + org.apache.hudi.org.apache.htrace. + org.apache.parquet.avro. org.apache.hudi.org.apache.parquet.avro. @@ -110,6 +135,70 @@ com.google.common. org.apache.hudi.com.google.common. + + org.apache.hadoop.metrics2.MetricHistogram + org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram + + + + org.apache.hadoop.metrics2.MetricsExecutor + org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor + + + + org.apache.hadoop.metrics2.impl.JmxCacheBuster + org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster + + + org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + + + + org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + + + + org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + + + + org.apache.hadoop.metrics2.lib.MutableFastCounter + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter + + + + org.apache.hadoop.metrics2.lib.MutableHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram + + + + org.apache.hadoop.metrics2.lib.MutableRangeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableSizeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableTimeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram + + + + org.apache.hadoop.metrics2.util.MetricQuantile + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile + + + + org.apache.hadoop.metrics2.util.MetricSampleQuantiles + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles + + false @@ -120,6 +209,8 @@ META-INF/*.DSA META-INF/*.RSA META-INF/services/javax.* + **/*.proto + hbase-webapps/** @@ -131,7 +222,7 @@ - src/main/resources + ../../hudi-common/src/main/resour src/test/resources @@ -168,47 +259,20 @@ compile - - org.apache.htrace - htrace-core - ${htrace.version} - compile - - - org.apache.hbase - hbase-common + hbase-hadoop-compat ${hbase.version} - org.apache.hbase - hbase-server + hbase-hadoop2-compat ${hbase.version} - compile - - - org.apache.hbase - hbase-common - - - javax.servlet - * - - - org.codehaus.jackson - * - - - org.mortbay.jetty - * - - - tomcat - * - - + + + org.apache.hbase.thirdparty + hbase-shaded-protobuf + ${hbase-thirdparty.version} diff --git a/packaging/hudi-integ-test-bundle/pom.xml b/packaging/hudi-integ-test-bundle/pom.xml index b53e02aaf776..e11212a4a34a 100644 --- a/packaging/hudi-integ-test-bundle/pom.xml +++ b/packaging/hudi-integ-test-bundle/pom.xml @@ -62,6 +62,9 @@ META-INF/services/org.apache.spark.sql.sources.DataSourceRegister + + hbase-default.xml + @@ -133,7 +136,6 @@ org.apache.hive:hive-common org.apache.hive:hive-service - org.apache.hive:hive-metastore org.apache.hive:hive-jdbc org.apache.hive:hive-exec @@ -398,6 +400,12 @@ hive-metastore ${hive.version} provided + + + org.apache.hbase + * + + diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index 90c1087dcb4d..51d24987df37 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -61,6 +61,7 @@ META-INF/LICENSE target/classes/META-INF/LICENSE + 
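hbase-default.xml is called out in the integ-test bundle's shade configuration above; the reason that file needs care is that HBase validates it against the client jars at runtime, so a bundled copy from a different HBase release fails fast. A sketch of where that validation happens, using HBase's own public HBaseConfiguration API rather than anything from this patch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;

    public class HBaseDefaultsCheck {
      public static void main(String[] args) {
        // HBaseConfiguration.create() loads hbase-default.xml from the classpath and verifies that
        // its hbase.defaults.for.version property matches the HBase jars in use; a stale bundled
        // copy makes this call fail with a version-mismatch error.
        Configuration conf = HBaseConfiguration.create();
        System.out.println("hbase.defaults.for.version = " + conf.get("hbase.defaults.for.version"));
      }
    }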
@@ -75,20 +76,49 @@ com.esotericsoftware:minlog org.apache.hbase:hbase-common org.apache.hbase:hbase-client + org.apache.hbase:hbase-hadoop-compat + org.apache.hbase:hbase-hadoop2-compat + org.apache.hbase:hbase-metrics + org.apache.hbase:hbase-metrics-api org.apache.hbase:hbase-protocol - org.apache.hbase:hbase-server - org.apache.htrace:htrace-core + org.apache.hbase:hbase-protocol-shaded + org.apache.hbase:hbase-annotations + org.apache.hbase.thirdparty:hbase-shaded-miscellaneous + org.apache.hbase.thirdparty:hbase-shaded-netty + org.apache.hbase.thirdparty:hbase-shaded-protobuf + org.apache.htrace:htrace-core4 com.yammer.metrics:metrics-core com.google.guava:guava + commons-io:commons-io commons-lang:commons-lang com.google.protobuf:protobuf-java + + org.apache.parquet.avro. + org.apache.hudi.org.apache.parquet.avro. + org.apache.avro. org.apache.hudi.org.apache.avro. + + org.apache.commons.io. + org.apache.hudi.org.apache.commons.io. + + + org.apache.hadoop.hbase. + org.apache.hudi.org.apache.hadoop.hbase. + + + org.apache.hbase. + org.apache.hudi.org.apache.hbase. + + + org.apache.htrace. + org.apache.hudi.org.apache.htrace. + org.codehaus.jackson. org.apache.hudi.org.codehaus.jackson. @@ -121,14 +151,74 @@ com.google.protobuf. ${presto.bundle.bootstrap.shade.prefix}com.google.protobuf. - - org.apache.htrace. - ${presto.bundle.bootstrap.shade.prefix}org.apache.htrace. - org.apache.parquet.avro. ${presto.bundle.bootstrap.shade.prefix}org.apache.parquet.avro. + + org.apache.hadoop.metrics2.MetricHistogram + org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram + + + + org.apache.hadoop.metrics2.MetricsExecutor + org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor + + + + org.apache.hadoop.metrics2.impl.JmxCacheBuster + org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster + + + org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + + + + org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + + + + org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + + + + org.apache.hadoop.metrics2.lib.MutableFastCounter + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter + + + + org.apache.hadoop.metrics2.lib.MutableHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram + + + + org.apache.hadoop.metrics2.lib.MutableRangeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableSizeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableTimeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram + + + + org.apache.hadoop.metrics2.util.MetricQuantile + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile + + + + org.apache.hadoop.metrics2.util.MetricSampleQuantiles + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles + + false @@ -138,8 +228,15 @@ META-INF/*.SF META-INF/*.DSA META-INF/*.RSA + META-INF/maven/com.google.protobuf/** + META-INF/maven/commons-io/** + META-INF/maven/org.apache.hbase/** + META-INF/maven/org.apache.hbase.thirdparty/** + META-INF/maven/org.apache.htrace/** META-INF/services/javax.* + **/*.proto com/esotericsoftware/reflectasm/** + hbase-webapps/** stringBehavior.avsc diff --git a/packaging/hudi-spark-bundle/pom.xml 
b/packaging/hudi-spark-bundle/pom.xml index a877d10a586a..54eb2a358f0e 100644 --- a/packaging/hudi-spark-bundle/pom.xml +++ b/packaging/hudi-spark-bundle/pom.xml @@ -63,6 +63,7 @@ META-INF/services/org.apache.spark.sql.sources.DataSourceRegister + @@ -116,13 +117,21 @@ org.apache.hbase:hbase-client org.apache.hbase:hbase-common - org.apache.hbase:hbase-protocol + org.apache.hbase:hbase-hadoop-compat + org.apache.hbase:hbase-hadoop2-compat + org.apache.hbase:hbase-metrics + org.apache.hbase:hbase-metrics-api + org.apache.hbase:hbase-protocol-shaded org.apache.hbase:hbase-server - org.apache.htrace:htrace-core + org.apache.hbase.thirdparty:hbase-shaded-miscellaneous + org.apache.hbase.thirdparty:hbase-shaded-netty + org.apache.hbase.thirdparty:hbase-shaded-protobuf + org.apache.htrace:htrace-core4 org.apache.curator:curator-framework org.apache.curator:curator-client org.apache.curator:curator-recipes commons-codec:commons-codec + commons-io:commons-io @@ -134,6 +143,22 @@ com.beust.jcommander. org.apache.hudi.com.beust.jcommander. + + org.apache.commons.io. + org.apache.hudi.org.apache.commons.io. + + + org.apache.hadoop.hbase. + org.apache.hudi.org.apache.hadoop.hbase. + + + org.apache.hbase. + org.apache.hudi.org.apache.hbase. + + + org.apache.htrace. + org.apache.hudi.org.apache.htrace. + org.apache.spark.sql.avro. ${spark.bundle.spark.shade.prefix}org.apache.spark.sql.avro. @@ -183,6 +208,70 @@ ${spark.bundle.spark.shade.prefix}com.google.common. + + org.apache.hadoop.metrics2.MetricHistogram + org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram + + + + org.apache.hadoop.metrics2.MetricsExecutor + org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor + + + + org.apache.hadoop.metrics2.impl.JmxCacheBuster + org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster + + + org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + + + + org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + + + + org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + + + + org.apache.hadoop.metrics2.lib.MutableFastCounter + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter + + + + org.apache.hadoop.metrics2.lib.MutableHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram + + + + org.apache.hadoop.metrics2.lib.MutableRangeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableSizeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableTimeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram + + + + org.apache.hadoop.metrics2.util.MetricQuantile + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile + + + + org.apache.hadoop.metrics2.util.MetricSampleQuantiles + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles + + @@ -191,7 +280,14 @@ META-INF/*.SF META-INF/*.DSA META-INF/*.RSA + META-INF/maven/com.google.protobuf/** + META-INF/maven/commons-io/** + META-INF/maven/org.apache.hbase/** + META-INF/maven/org.apache.hbase.thirdparty/** + META-INF/maven/org.apache.htrace/** META-INF/services/javax.* + **/*.proto + hbase-webapps/** @@ -333,6 +429,10 @@ ${hbase.version} compile + + guava + com.google.guava + org.apache.hbase hbase-common @@ -362,9 
+462,34 @@ org.apache.hbase - hbase-protocol + hbase-protocol-shaded ${hbase.version} + + org.apache.hbase.thirdparty + hbase-shaded-miscellaneous + ${hbase-thirdparty.version} + + + org.apache.hbase.thirdparty + hbase-shaded-netty + ${hbase-thirdparty.version} + + + org.apache.hbase + hbase-hadoop-compat + ${hbase.version} + + + org.apache.hbase + hbase-hadoop2-compat + ${hbase.version} + + + org.apache.hbase.thirdparty + hbase-shaded-protobuf + ${hbase-thirdparty.version} + diff --git a/packaging/hudi-trino-bundle/pom.xml b/packaging/hudi-trino-bundle/pom.xml index adf73f1bb0b8..7deced3a323d 100644 --- a/packaging/hudi-trino-bundle/pom.xml +++ b/packaging/hudi-trino-bundle/pom.xml @@ -62,6 +62,7 @@ META-INF/LICENSE target/classes/META-INF/LICENSE + @@ -76,22 +77,49 @@ com.esotericsoftware:minlog org.apache.hbase:hbase-common org.apache.hbase:hbase-client - org.apache.hbase:hbase-protocol + org.apache.hbase:hbase-hadoop-compat + org.apache.hbase:hbase-hadoop2-compat + org.apache.hbase:hbase-metrics + org.apache.hbase:hbase-metrics-api + org.apache.hbase:hbase-protocol-shaded org.apache.hbase:hbase-server org.apache.hbase:hbase-annotations - org.apache.htrace:htrace-core + org.apache.hbase.thirdparty:hbase-shaded-protobuf + org.apache.hbase.thirdparty:hbase-shaded-netty + org.apache.hbase.thirdparty:hbase-shaded-miscellaneous + org.apache.htrace:htrace-core4 com.yammer.metrics:metrics-core com.google.guava:guava commons-lang:commons-lang + commons-io:commons-io com.google.protobuf:protobuf-java - + + org.apache.parquet.avro. + org.apache.hudi.org.apache.parquet.avro. + org.apache.avro. org.apache.hudi.org.apache.avro. + + org.apache.commons.io. + org.apache.hudi.org.apache.commons.io. + + + org.apache.hadoop.hbase. + org.apache.hudi.org.apache.hadoop.hbase. + + + org.apache.hbase. + org.apache.hudi.org.apache.hbase. + + + org.apache.htrace. + org.apache.hudi.org.apache.htrace. + org.codehaus.jackson. org.apache.hudi.org.codehaus.jackson. @@ -124,6 +152,70 @@ com.google.protobuf. ${trino.bundle.bootstrap.shade.prefix}com.google.protobuf. 
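Each bundle above relocates the bundled HBase, htrace, and commons-io packages under the org.apache.hudi. prefix so they cannot clash with the versions the query engine or Spark already ships. A hypothetical spot-check of such a relocated class on a bundle's classpath; the concrete class name is an assumption for illustration, not something named in the patch:

    public class RelocationProbe {
      public static void main(String[] args) {
        // With the org.apache.hadoop.hbase. -> org.apache.hudi.org.apache.hadoop.hbase. relocation,
        // a shaded HBase class is expected under the prefixed name inside the bundle jar.
        String relocated = "org.apache.hudi.org.apache.hadoop.hbase.CellUtil";
        try {
          Class.forName(relocated);
          System.out.println("Found relocated class: " + relocated);
        } catch (ClassNotFoundException e) {
          System.out.println("Not on this classpath (expected unless a shaded bundle jar is used): " + relocated);
        }
      }
    }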
+ + org.apache.hadoop.metrics2.MetricHistogram + org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram + + + + org.apache.hadoop.metrics2.MetricsExecutor + org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor + + + + org.apache.hadoop.metrics2.impl.JmxCacheBuster + org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster + + + org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + + + + org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + + + + org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + + + + org.apache.hadoop.metrics2.lib.MutableFastCounter + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter + + + + org.apache.hadoop.metrics2.lib.MutableHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram + + + + org.apache.hadoop.metrics2.lib.MutableRangeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableSizeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableTimeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram + + + + org.apache.hadoop.metrics2.util.MetricQuantile + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile + + + + org.apache.hadoop.metrics2.util.MetricSampleQuantiles + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles + + false @@ -133,7 +225,14 @@ META-INF/*.SF META-INF/*.DSA META-INF/*.RSA + META-INF/maven/com.google.protobuf/** + META-INF/maven/commons-io/** + META-INF/maven/org.apache.hbase/** + META-INF/maven/org.apache.hbase.thirdparty/** + META-INF/maven/org.apache.htrace/** META-INF/services/javax.* + **/*.proto + hbase-webapps/** @@ -159,16 +258,6 @@ org.apache.hudi hudi-common ${project.version} - - - org.apache.hbase - hbase-server - - - org.apache.hbase - hbase-client - - org.apache.hudi diff --git a/packaging/hudi-utilities-bundle/pom.xml b/packaging/hudi-utilities-bundle/pom.xml index 1ffca7634a1f..5e0366abfeed 100644 --- a/packaging/hudi-utilities-bundle/pom.xml +++ b/packaging/hudi-utilities-bundle/pom.xml @@ -86,6 +86,7 @@ META-INF/services/org.apache.spark.sql.sources.DataSourceRegister + @@ -150,13 +151,21 @@ org.apache.hbase:hbase-client org.apache.hbase:hbase-common - org.apache.hbase:hbase-protocol + org.apache.hbase:hbase-hadoop-compat + org.apache.hbase:hbase-hadoop2-compat + org.apache.hbase:hbase-metrics + org.apache.hbase:hbase-metrics-api + org.apache.hbase:hbase-protocol-shaded org.apache.hbase:hbase-server - org.apache.htrace:htrace-core + org.apache.hbase.thirdparty:hbase-shaded-miscellaneous + org.apache.hbase.thirdparty:hbase-shaded-netty + org.apache.hbase.thirdparty:hbase-shaded-protobuf + org.apache.htrace:htrace-core4 org.apache.curator:curator-framework org.apache.curator:curator-client org.apache.curator:curator-recipes commons-codec:commons-codec + commons-io:commons-io @@ -172,6 +181,22 @@ org.apache.hive.jdbc. ${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc. + + org.apache.commons.io. + org.apache.hudi.org.apache.commons.io. + + + org.apache.hadoop.hbase. + org.apache.hudi.org.apache.hadoop.hbase. + + + org.apache.hbase. + org.apache.hudi.org.apache.hbase. + + + org.apache.htrace. + org.apache.hudi.org.apache.htrace. 
+ org.apache.hadoop.hive.metastore. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore. @@ -208,6 +233,70 @@ org.eclipse.jetty. org.apache.hudi.org.eclipse.jetty. + + org.apache.hadoop.metrics2.MetricHistogram + org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram + + + + org.apache.hadoop.metrics2.MetricsExecutor + org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor + + + + org.apache.hadoop.metrics2.impl.JmxCacheBuster + org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster + + + org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + + + + org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + + + + org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + + + + org.apache.hadoop.metrics2.lib.MutableFastCounter + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter + + + + org.apache.hadoop.metrics2.lib.MutableHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram + + + + org.apache.hadoop.metrics2.lib.MutableRangeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableSizeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableTimeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram + + + + org.apache.hadoop.metrics2.util.MetricQuantile + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile + + + + org.apache.hadoop.metrics2.util.MetricSampleQuantiles + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles + + @@ -216,7 +305,14 @@ META-INF/*.SF META-INF/*.DSA META-INF/*.RSA + META-INF/maven/com.google.protobuf/** + META-INF/maven/commons-io/** + META-INF/maven/org.apache.hbase/** + META-INF/maven/org.apache.hbase.thirdparty/** + META-INF/maven/org.apache.htrace/** META-INF/services/javax.* + **/*.proto + hbase-webapps/** @@ -345,6 +441,12 @@ org.apache.hbase hbase-common ${hbase.version} + + + guava + com.google.guava + + org.apache.hbase @@ -352,6 +454,10 @@ ${hbase.version} compile + + guava + com.google.guava + org.apache.hbase hbase-common @@ -381,9 +487,29 @@ org.apache.hbase - hbase-protocol + hbase-protocol-shaded + ${hbase.version} + + + org.apache.hbase.thirdparty + hbase-shaded-miscellaneous + ${hbase-thirdparty.version} + + + org.apache.hbase.thirdparty + hbase-shaded-netty + ${hbase-thirdparty.version} + + + org.apache.hbase + hbase-hadoop-compat ${hbase.version} + + org.apache.hbase.thirdparty + hbase-shaded-protobuf + ${hbase-thirdparty.version} + diff --git a/pom.xml b/pom.xml index 33bd112a1bfe..ec0699aff197 100644 --- a/pom.xml +++ b/pom.xml @@ -103,7 +103,7 @@ 2.17.0 1.7.30 2.9.9 - 2.7.3 + 2.10.1 org.apache.hive 2.3.1 core @@ -131,7 +131,8 @@ 0.12.0 9.4.15.v20190215 3.1.0-incubating - 1.2.3 + 2.4.9 + 3.5.1 1.9.13 1.4.199 3.1.2 @@ -163,6 +164,7 @@ 3.17.3 3.11.4 1.1.0 + 3.5.7 8000 http://localhost:${dynamodb-local.port} @@ -1522,7 +1524,7 @@ https://docs.spring.io/spring-shell/docs/1.2.0.RELEASE https://fasterxml.github.io/jackson-databind/javadoc/2.6 https://hadoop.apache.org/docs/r${hadoop.version}/api - https://hbase.apache.org/1.2/apidocs + https://hbase.apache.org/2.4/apidocs https://hive.apache.org/javadocs/r2.3.6/api https://javadoc.io/static/io.javalin/javalin/2.3.0 
https://javadoc.io/doc/org.apache.parquet/parquet-avro/${parquet.version}
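The root pom bumps above appear to move the build to Hadoop 2.10.1 and HBase 2.4.9 (with hbase-thirdparty 3.5.1). As a quick illustrative check, not part of this patch, HBase's own VersionInfo utility reports which HBase client actually ends up on a runtime classpath:

    import org.apache.hadoop.hbase.util.VersionInfo;

    public class HBaseVersionCheck {
      public static void main(String[] args) {
        // Expected to print 2.4.9 once the upgraded hbase-common/hbase-client jars are resolved.
        System.out.println("HBase client version: " + VersionInfo.getVersion());
      }
    }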