@@ -37,6 +37,7 @@
import com.facebook.presto.spi.TableNotFoundException;
import com.facebook.presto.spi.connector.ConnectorMetadata;
import com.facebook.presto.spi.connector.ConnectorOutputMetadata;
import com.facebook.presto.spi.statistics.ComputedStatistics;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
@@ -102,7 +103,7 @@ public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, Con
}

@Override
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments)
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments, List<ComputedStatistics> computedStatistics)
{
clearRollback();
return Optional.empty();
@@ -212,7 +213,7 @@ public ConnectorInsertTableHandle beginInsert(ConnectorSession session, Connecto
}

@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments)
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, List<ComputedStatistics> computedStatistics)
{
clearRollback();
return Optional.empty();
@@ -31,6 +31,7 @@
import com.facebook.presto.spi.TableNotFoundException;
import com.facebook.presto.spi.connector.ConnectorMetadata;
import com.facebook.presto.spi.connector.ConnectorOutputMetadata;
import com.facebook.presto.spi.statistics.ComputedStatistics;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.airlift.slice.Slice;
@@ -173,7 +174,7 @@ public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, Con
}

@Override
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments)
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments, List<ComputedStatistics> computedStatistics)
{
JdbcOutputTableHandle handle = (JdbcOutputTableHandle) tableHandle;
jdbcClient.commitCreateTable(handle);
@@ -203,7 +204,7 @@ public ConnectorInsertTableHandle beginInsert(ConnectorSession session, Connecto
}

@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle tableHandle, Collection<Slice> fragments)
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle tableHandle, Collection<Slice> fragments, List<ComputedStatistics> computedStatistics)
{
JdbcOutputTableHandle jdbcInsertHandle = (JdbcOutputTableHandle) tableHandle;
jdbcClient.finishInsertTable(jdbcInsertHandle);
@@ -31,6 +31,7 @@
import com.facebook.presto.spi.SchemaTablePrefix;
import com.facebook.presto.spi.connector.ConnectorMetadata;
import com.facebook.presto.spi.connector.ConnectorOutputMetadata;
import com.facebook.presto.spi.statistics.ComputedStatistics;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
@@ -159,7 +160,7 @@ public void renameTable(ConnectorSession session, ConnectorTableHandle tableHand
public void createTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, boolean ignoreExisting)
{
ConnectorOutputTableHandle outputTableHandle = beginCreateTable(session, tableMetadata, Optional.empty());
finishCreateTable(session, outputTableHandle, ImmutableList.of());
finishCreateTable(session, outputTableHandle, ImmutableList.of(), ImmutableList.of());
}

@Override
@@ -220,7 +221,7 @@ public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, Con
}

@Override
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments)
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments, List<ComputedStatistics> computedStatistics)
{
BlackHoleOutputTableHandle blackHoleOutputTableHandle = (BlackHoleOutputTableHandle) tableHandle;
BlackHoleTableHandle table = blackHoleOutputTableHandle.getTable();
@@ -236,7 +237,7 @@ public ConnectorInsertTableHandle beginInsert(ConnectorSession session, Connecto
}

@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments)
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, List<ComputedStatistics> computedStatistics)
{
return Optional.empty();
}
@@ -65,7 +65,7 @@ public void tableIsCreatedAfterCommits()

assertThatNoTableIsCreated();

metadata.finishCreateTable(SESSION, table, ImmutableList.of());
metadata.finishCreateTable(SESSION, table, ImmutableList.of(), ImmutableList.of());

List<SchemaTableName> tables = metadata.listTables(SESSION, null);
assertTrue(tables.size() == 1, "Expected only one table.");
@@ -35,6 +35,7 @@
import com.facebook.presto.spi.connector.ConnectorMetadata;
import com.facebook.presto.spi.connector.ConnectorOutputMetadata;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.statistics.ComputedStatistics;
import com.facebook.presto.spi.type.Type;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
@@ -312,7 +313,7 @@ public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, Con
}

@Override
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments)
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments, List<ComputedStatistics> computedStatistics)
{
return Optional.empty();
}
@@ -339,7 +340,7 @@ public ConnectorInsertTableHandle beginInsert(ConnectorSession session, Connecto
}

@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments)
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, List<ComputedStatistics> computedStatistics)
{
return Optional.empty();
}
presto-docs/src/main/sphinx/connector/hive.rst (63 changes: 60 additions & 3 deletions)
@@ -111,9 +111,9 @@ security options in the Hive connector.
Hive Configuration Properties
-----------------------------

================================================== ============================================================ ==========
================================================== ============================================================ =============================
Property Name Description Default
================================================== ============================================================ ==========
================================================== ============================================================ =============================
``hive.metastore.uri`` The URI(s) of the Hive metastore to connect to using the
Thrift protocol. If multiple URIs are provided, the first
URI is used by default and the rest of the URIs are
@@ -175,7 +175,11 @@ Property Name Description
``hive.non-managed-table-writes-enabled`` Enable writes to non-managed (external) Hive tables. ``false``

``hive.non-managed-table-creates-enabled`` Enable creating non-managed (external) Hive tables. ``true``
================================================== ============================================================ ==========

``hive.collect-column-statistics-on-write`` Enables automatic column level statistics collection ``ENABLED_FOR_MARKED_TABLES``
on write. Possible values are ``ENABLED``,
``ENABLED_FOR_MARKED_TABLES`` or ``DISABLED``
================================================== ============================================================ =============================

Contributor: Let's add another sentence:

    See `Table Statistics <#table-statistics>`__ for details.

(This syntax should work. I copied it from accumulo.rst)

Amazon S3 Configuration
-----------------------
@@ -334,6 +338,59 @@ the ``org.apache.hadoop.conf.Configurable`` interface from the Hadoop Java API,
will be passed in after the object instance is created and before it is asked to provision or retrieve any
encryption keys.

Table Statistics
----------------

The Hive connector collects ``numRows``, ``rawDataSize``, ``totalSize``, ``numFiles`` statistics
automatically on ``INSERT INTO`` and ``CREATE TABLE AS SELECT`` operations.

Contributor: How about

    The Hive connector automatically collects basic statistics
    (``numFiles``, ``numRows``, ``rawDataSize``, ``totalSize``)
    on ``INSERT`` and ``CREATE TABLE AS`` operations.

I shortened the names to match what we call them in the SQL Statement Syntax documentation.

The Hive connector can also collect the column level statistics:

Contributor: Remove "the"

    The Hive connector can also collect column level statistics:

============= ================================================================================================================
Column Type Collectible Statistics
============= ================================================================================================================
``TINYINT`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``

``SMALLINT`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``

``INTEGER`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``

``BIGINT`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``

``DOUBLE`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``

``REAL`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``

``BOOLEAN`` ``NUMBER_OF_NULLS``, ``NUMBER_OF_FALSE``, ``NUMBER_OF_TRUE``

``VARCHAR`` ``NUMBER_OF_NULLS``, ``NUMBER_OF_DISTINCT_VALUES``, ``MAX_VALUE_SIZE_IN_BYTES``, ``AVERAGE_VALUE_SIZE_IN_BYTES``

``CHAR`` ``NUMBER_OF_NULLS``, ``NUMBER_OF_DISTINCT_VALUES``

``VARBINARY`` ``NUMBER_OF_NULLS``, ``MAX_VALUE_SIZE_IN_BYTES``, ``AVERAGE_VALUE_SIZE_IN_BYTES``

``DATE`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``

``TIMESTAMP`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``

``DECIMAL`` ``NUMBER_OF_NULLS``, ``MIN``, ``MAX``, ``NUMBER_OF_DISTINCT_VALUES``
============= ================================================================================================================

Contributor: Let's remove all the blank lines between rows, since each row is only one line. That should make it a bit easier to read.

Automatic column level statistics collection on write can be enabled by tuning the ``hive.collect-column-statistics-on-write``
property:

Contributor: Suggested wording:

    Automatic column level statistics collection on write can be enabled using
    the ``hive.collect-column-statistics-on-write`` property:

* ``ENABLED`` - Presto will collect the column level statistics for all the tables.

Contributor: s/for all the tables/for all tables

Contributor: How about

    * ``ENABLED``: Always collect column level statistics when writing tables.

    * ``ENABLED_FOR_MARKED_TABLES``: Collect column level statistics when writing
      to any table that was created with the ``collect_column_statistics_on_write``
      table property set to ``true``::

        CREATE TABLE automatically_collect_column_statistics (
            id bigint
        )
        WITH (collect_column_statistics_on_write_enabled = true)

    * ``DISABLED``: Never collect column level statistics when writing tables.

Member Author: Removed the entire section. The property now takes a boolean. Boolean is pretty much self explanatory.

* ``ENABLED_FOR_MARKED_TABLES`` - Presto will collect the column level statistics for the tables
  created with the ``collect_column_statistics_on_write_enabled`` set to ``true``:

Contributor: s/with the/with

Member Author: Removed the section

  ::

      CREATE TABLE automatically_collect_column_statistics (
          a BIGINT
      )
      WITH (collect_column_statistics_on_write_enabled = true)

* ``DISABLED`` - Presto will not collect the column level statistics for any table.

Schema Evolution
----------------

@@ -0,0 +1,24 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive;

import com.facebook.presto.spi.statistics.ColumnStatisticType;
import com.facebook.presto.spi.type.Type;

import java.util.Set;

public interface CollectibleStatisticsProvider
{
Set<ColumnStatisticType> get(Type type);
}
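
Since the interface above has a single abstract method, an implementation can also be supplied as a lambda or method reference (the Hive module later in this change binds Statistics::getSupportedStatistics). A minimal sketch of a class-based implementation, assuming only this interface and Guava's ImmutableSet; the class name is illustrative and not part of this change:

    package com.facebook.presto.hive;

    import com.facebook.presto.spi.statistics.ColumnStatisticType;
    import com.facebook.presto.spi.type.Type;
    import com.google.common.collect.ImmutableSet;

    import java.util.Set;

    public class NoOpCollectibleStatisticsProvider
            implements CollectibleStatisticsProvider
    {
        @Override
        public Set<ColumnStatisticType> get(Type type)
        {
            // Report no collectible statistics for any type, which effectively
            // disables column level statistics collection for the catalog.
            return ImmutableSet.of();
        }
    }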
@@ -13,11 +13,6 @@
*/
package com.facebook.presto.hive;

import com.google.common.collect.ImmutableMap;

import javax.annotation.Nullable;

import java.util.Map;
import java.util.Objects;
import java.util.OptionalLong;

@@ -27,11 +22,6 @@

public class HiveBasicStatistics
{
private static final String NUM_FILES = "numFiles";
private static final String NUM_ROWS = "numRows";
private static final String RAW_DATA_SIZE = "rawDataSize";
private static final String TOTAL_SIZE = "totalSize";

private final OptionalLong fileCount;
private final OptionalLong rowCount;
private final OptionalLong inMemoryDataSizeInBytes;
@@ -120,40 +110,4 @@ public String toString()
.add("onDiskDataSizeInBytes", onDiskDataSizeInBytes)
.toString();
}

public Map<String, String> toPartitionParameters()
{
ImmutableMap.Builder<String, String> properties = ImmutableMap.builder();
fileCount.ifPresent(count -> properties.put(NUM_FILES, Long.toString(count)));
rowCount.ifPresent(count -> properties.put(NUM_ROWS, Long.toString(count)));
inMemoryDataSizeInBytes.ifPresent(size -> properties.put(RAW_DATA_SIZE, Long.toString(size)));
onDiskDataSizeInBytes.ifPresent(size -> properties.put(TOTAL_SIZE, Long.toString(size)));
return properties.build();
}

public static HiveBasicStatistics createFromPartitionParameters(Map<String, String> parameters)
{
OptionalLong numFiles = parse(parameters.get(NUM_FILES));
OptionalLong numRows = parse(parameters.get(NUM_ROWS));
OptionalLong inMemoryDataSizeInBytes = parse(parameters.get(RAW_DATA_SIZE));
OptionalLong onDiskDataSizeInBytes = parse(parameters.get(TOTAL_SIZE));
return new HiveBasicStatistics(numFiles, numRows, inMemoryDataSizeInBytes, onDiskDataSizeInBytes);
}

private static OptionalLong parse(@Nullable String parameterValue)
{
if (parameterValue == null) {
return OptionalLong.empty();
}
try {
long longValue = Long.parseLong(parameterValue);
if (longValue < 0) {
return OptionalLong.empty();
}
return OptionalLong.of(longValue);
}
catch (NumberFormatException e) {
return OptionalLong.empty();
}
}
}
@@ -37,6 +37,7 @@
import java.util.TimeZone;
import java.util.concurrent.TimeUnit;

import static com.facebook.presto.hive.HiveClientConfig.CollectColumnStatisticsOnWriteOption.ENABLED_FOR_MARKED_TABLES;
import static io.airlift.units.DataSize.Unit.MEGABYTE;

@DefunctConfig({
@@ -134,6 +135,8 @@ public class HiveClientConfig

private boolean tableStatisticsEnabled = true;

private CollectColumnStatisticsOnWriteOption collectColumnStatisticsOnWrite = ENABLED_FOR_MARKED_TABLES;

Contributor: The enable-only-for-marked-tables feature seems like something transitional that we would later change to be always enabled. Does it make sense to split this into two booleans, so we can remove the marked table support later? Or do you expect we'll want that forever?

Member Author: Actually I think meanwhile the plan had changed, and we no longer need this. I will replace the enum with a simple boolean.


public int getMaxInitialSplits()
{
return maxInitialSplits;
@@ -1044,4 +1047,25 @@ public boolean isTableStatisticsEnabled()
{
return tableStatisticsEnabled;
}

@NotNull
public CollectColumnStatisticsOnWriteOption getCollectColumnStatisticsOnWrite()
{
return collectColumnStatisticsOnWrite;
}

@Config("hive.collect-column-statistics-on-write")
@ConfigDescription("Enables automatic column level statistics collection on write")
public HiveClientConfig setCollectColumnStatisticsOnWrite(CollectColumnStatisticsOnWriteOption collectColumnStatisticsOnWrite)
{
this.collectColumnStatisticsOnWrite = collectColumnStatisticsOnWrite;
return this;
}

public enum CollectColumnStatisticsOnWriteOption
{
ENABLED,
ENABLED_FOR_MARKED_TABLES,

Contributor: Add a javadoc linking to com.facebook.presto.hive.HiveTableProperties#COLLECT_COLUMN_STATISTICS_ON_WRITE_ENABLED.

Member Author: I removed the enum.

DISABLED

Contributor: Put DISABLED first, end all options with ","

Member Author: I removed the enum.

}
}
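
A hedged sketch of how the new option is exercised, using only the enum, the getter, and the fluent setter added above; the standalone class is illustrative and not part of this diff. The commented catalog property line follows from the @Config annotation:

    package com.facebook.presto.hive;

    import com.facebook.presto.hive.HiveClientConfig.CollectColumnStatisticsOnWriteOption;

    public class CollectColumnStatisticsOnWriteExample
    {
        public static void main(String[] args)
        {
            // The default mirrors the field initializer above.
            HiveClientConfig config = new HiveClientConfig();
            System.out.println(config.getCollectColumnStatisticsOnWrite()); // ENABLED_FOR_MARKED_TABLES

            // The setter is fluent, like the other HiveClientConfig properties.
            // In a catalog properties file this corresponds to:
            //     hive.collect-column-statistics-on-write=ENABLED
            config.setCollectColumnStatisticsOnWrite(CollectColumnStatisticsOnWriteOption.ENABLED);
            System.out.println(config.getCollectColumnStatisticsOnWrite()); // ENABLED
        }
    }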
@@ -19,6 +19,7 @@
import com.facebook.presto.hive.parquet.ParquetPageSourceFactory;
import com.facebook.presto.hive.parquet.ParquetRecordCursorProvider;
import com.facebook.presto.hive.rcfile.RcFilePageSourceFactory;
import com.facebook.presto.hive.util.Statistics;
import com.facebook.presto.spi.connector.ConnectorNodePartitioningProvider;
import com.facebook.presto.spi.connector.ConnectorPageSinkProvider;
import com.facebook.presto.spi.connector.ConnectorPageSourceProvider;
@@ -60,6 +61,7 @@ public void configure(Binder binder)
binder.bind(HiveConnectorId.class).toInstance(new HiveConnectorId(connectorId));
binder.bind(TypeTranslator.class).toInstance(new HiveTypeTranslator());
binder.bind(CoercionPolicy.class).to(HiveCoercionPolicy.class).in(Scopes.SINGLETON);
binder.bind(CollectibleStatisticsProvider.class).toInstance(Statistics::getSupportedStatistics);

Contributor: We should return nothing when the Glue metastore is used.

Member Author: I'm going to replace ExtendedHiveMetastore#supportsColumnStatistics with ExtendedHiveMetastore#getSupportedColumnStatistics.


binder.bind(HdfsConfigurationUpdater.class).in(Scopes.SINGLETON);
binder.bind(HdfsConfiguration.class).to(HiveHdfsConfiguration.class).in(Scopes.SINGLETON);
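
A rough sketch of the reviewer's suggestion above, under the assumption that the module knows whether the configured metastore can persist column statistics; the flag and helper method are hypothetical, and the author's stated plan is to move this decision onto ExtendedHiveMetastore#getSupportedColumnStatistics instead. It reuses the Statistics and ImmutableSet imports already present in this file:

    // Hypothetical helper, not part of this change: report nothing when the
    // metastore (e.g. Glue) cannot persist column statistics.
    private static CollectibleStatisticsProvider collectibleStatisticsProvider(boolean metastoreSupportsColumnStatistics)
    {
        return metastoreSupportsColumnStatistics
                ? Statistics::getSupportedStatistics
                : type -> ImmutableSet.of();
    }

    // Usage inside configure(Binder binder), replacing the binding above:
    //     binder.bind(CollectibleStatisticsProvider.class)
    //             .toInstance(collectibleStatisticsProvider(metastoreSupportsColumnStatistics));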