Skip to content
Merged
3 changes: 3 additions & 0 deletions presto-docs/src/main/sphinx/connector/hive.rst
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ Property Name Description
S3SelectPushdown.

``hive.metastore.load-balancing-enabled`` Enable load balancing between multiple Metastore instances

``hive.skip-empty-files`` Enable skipping empty files. Otherwise, it will produce an ``false``
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a false
explicitly specify the default value of this property
What is a "false error"?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

screenshot of local docs build - the doc when built may answer all of these issues

Screenshot 2024-05-13 at 11 59 57 AM

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, it's a table.

error when iterating through empty files.
================================================== ============================================================ ============

Metastore Configuration Properties
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ public Iterator<HiveFileInfo> list(
path,
p -> new HadoopFileInfoIterator(fileSystem.listLocatedStatus(p)),
namenodeStats,
hiveDirectoryContext.getNestedDirectoryPolicy());
hiveDirectoryContext.getNestedDirectoryPolicy(),
hiveDirectoryContext.isSkipEmptyFilesEnabled());
}

public static class HadoopFileInfoIterator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ public class HiveClientConfig

private boolean partitionFilteringFromMetastoreEnabled = true;

private boolean skipEmptyFiles;

private boolean parallelParsingOfPartitionValuesEnabled;
private int maxParallelParsingConcurrency = 100;
private boolean quickStatsEnabled;
Expand Down Expand Up @@ -1832,4 +1834,17 @@ public HiveClientConfig setAffinitySchedulingFileSectionSize(DataSize affinitySc
this.affinitySchedulingFileSectionSize = affinitySchedulingFileSectionSize;
return this;
}

/**
 * Sets whether zero-length files are skipped during directory listing.
 * Per the connector documentation, when this is disabled an error is
 * produced while iterating through empty files.
 *
 * @param skipEmptyFiles {@code true} to skip empty files during listing
 * @return this config instance, for call chaining
 */
@Config("hive.skip-empty-files")
@ConfigDescription("Enables skipping empty files, avoiding errors while iterating through them")
public HiveClientConfig setSkipEmptyFilesEnabled(boolean skipEmptyFiles)
{
    this.skipEmptyFiles = skipEmptyFiles;
    return this;
}

/**
 * Returns whether zero-length files are skipped during directory listing.
 *
 * @return {@code true} if empty files are skipped
 */
public boolean isSkipEmptyFilesEnabled()
{
    return skipEmptyFiles;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ public class HiveDirectoryContext
private final Map<String, String> additionalProperties;
private final RuntimeStats runtimeStats;
private boolean cacheable;
private boolean skipEmptyFiles;

public HiveDirectoryContext(
NestedDirectoryPolicy nestedDirectoryPolicy,
boolean cacheable,
boolean skipEmptyFiles,
ConnectorIdentity connectorIdentity,
Map<String, String> additionalProperties,
RuntimeStats runtimeStats)
Expand All @@ -43,6 +45,7 @@ public HiveDirectoryContext(

// this can be disabled
this.cacheable = cacheable;
this.skipEmptyFiles = skipEmptyFiles;
}

public NestedDirectoryPolicy getNestedDirectoryPolicy()
Expand Down Expand Up @@ -70,6 +73,11 @@ public Map<String, String> getAdditionalProperties()
return additionalProperties;
}

/**
 * Returns whether this listing context requests that zero-length files
 * be skipped.
 *
 * @return {@code true} if empty files should be skipped
 */
public boolean isSkipEmptyFilesEnabled()
{
    return this.skipEmptyFiles;
}

public RuntimeStats getRuntimeStats()
{
return runtimeStats;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ public final class HiveSessionProperties
public static final String QUICK_STATS_BACKGROUND_BUILD_TIMEOUT = "quick_stats_background_build_timeout";
public static final String DYNAMIC_SPLIT_SIZES_ENABLED = "dynamic_split_sizes_enabled";
public static final String AFFINITY_SCHEDULING_FILE_SECTION_SIZE = "affinity_scheduling_file_section_size";
public static final String SKIP_EMPTY_FILES = "skip_empty_files";

private final List<PropertyMetadata<?>> sessionProperties;

Expand Down Expand Up @@ -641,6 +642,11 @@ public HiveSessionProperties(HiveClientConfig hiveClientConfig, OrcFileWriterCon
AFFINITY_SCHEDULING_FILE_SECTION_SIZE,
"Size of file section for affinity scheduling",
hiveClientConfig.getAffinitySchedulingFileSectionSize(),
false),
booleanProperty(
SKIP_EMPTY_FILES,
"If it is required empty files will be skipped",
hiveClientConfig.isSkipEmptyFilesEnabled(),
false));
}

Expand Down Expand Up @@ -1118,4 +1124,9 @@ public static DataSize getAffinitySchedulingFileSectionSize(ConnectorSession ses
{
return session.getProperty(AFFINITY_SCHEDULING_FILE_SECTION_SIZE, DataSize.class);
}

/**
 * Reads the {@code skip_empty_files} session property.
 *
 * @param session the connector session to read the property from
 * @return {@code true} if empty files should be skipped for this session
 */
public static boolean isSkipEmptyFilesEnabled(ConnectorSession session)
{
    Boolean skipEmptyFiles = session.getProperty(SKIP_EMPTY_FILES, Boolean.class);
    return skipEmptyFiles;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ public Iterator<HiveFileInfo> list(
table.getStorage().getLocation(),
p),
namenodeStats,
hiveDirectoryContext.getNestedDirectoryPolicy());
hiveDirectoryContext.getNestedDirectoryPolicy(),
hiveDirectoryContext.isSkipEmptyFilesEnabled());
}

public static class HudiFileInfoIterator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import static com.facebook.presto.hive.HiveSessionProperties.getMaxInitialSplitSize;
import static com.facebook.presto.hive.HiveSessionProperties.getMaxSplitSize;
import static com.facebook.presto.hive.HiveSessionProperties.isManifestVerificationEnabled;
import static com.facebook.presto.hive.HiveSessionProperties.isSkipEmptyFilesEnabled;
import static com.facebook.presto.hive.HiveUtil.buildDirectoryContextProperties;
import static com.facebook.presto.hive.HiveUtil.getInputFormat;
import static com.facebook.presto.hive.NestedDirectoryPolicy.IGNORED;
Expand Down Expand Up @@ -191,6 +192,7 @@ private void validateManifest(ConnectorSession session, HivePartitionMetadata pa
HiveDirectoryContext hiveDirectoryContext = new HiveDirectoryContext(
recursiveDirWalkerEnabled ? RECURSE : IGNORED,
false,
isSkipEmptyFilesEnabled(session),
hdfsContext.getIdentity(),
buildDirectoryContextProperties(session),
session.getRuntimeStats());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
import static com.facebook.presto.hive.HiveSessionProperties.getMaxInitialSplitSize;
import static com.facebook.presto.hive.HiveSessionProperties.isFileSplittable;
import static com.facebook.presto.hive.HiveSessionProperties.isOrderBasedExecutionEnabled;
import static com.facebook.presto.hive.HiveSessionProperties.isSkipEmptyFilesEnabled;
import static com.facebook.presto.hive.HiveSessionProperties.isUseListDirectoryCache;
import static com.facebook.presto.hive.HiveUtil.buildDirectoryContextProperties;
import static com.facebook.presto.hive.HiveUtil.getFooterCount;
Expand Down Expand Up @@ -378,6 +379,7 @@ private Iterator<InternalHiveSplit> createInternalHiveSplitIterator(
HiveDirectoryContext hiveDirectoryContext = new HiveDirectoryContext(
recursiveDirWalkerEnabled ? RECURSE : IGNORED,
cacheable,
isSkipEmptyFilesEnabled(session),
hdfsContext.getIdentity(),
buildDirectoryContextProperties(session),
session.getRuntimeStats());
Expand Down Expand Up @@ -410,6 +412,7 @@ private List<InternalHiveSplit> getBucketedSplits(
Iterators.addAll(fileInfos, directoryLister.list(fileSystem, table, path, partition, namenodeStats, new HiveDirectoryContext(
FAIL,
isUseListDirectoryCache(session),
isSkipEmptyFilesEnabled(session),
hdfsContext.getIdentity(),
buildDirectoryContextProperties(session),
session.getRuntimeStats())));
Expand Down Expand Up @@ -559,6 +562,7 @@ private List<InternalHiveSplit> getVirtuallyBucketedSplits(Path path, ExtendedFi
HiveDirectoryContext hiveDirectoryContext = new HiveDirectoryContext(
recursiveDirWalkerEnabled ? RECURSE : IGNORED,
isUseListDirectoryCache(session),
isSkipEmptyFilesEnabled(session),
hdfsContext.getIdentity(),
buildDirectoryContextProperties(session),
session.getRuntimeStats());
Expand All @@ -579,6 +583,7 @@ private List<Path> getTargetPathsFromSymlink(ExtendedFileSystem fileSystem, Path
HiveDirectoryContext hiveDirectoryContext = new HiveDirectoryContext(
IGNORED,
isUseListDirectoryCache(session),
isSkipEmptyFilesEnabled(session),
hdfsContext.getIdentity(),
buildDirectoryContextProperties(session),
session.getRuntimeStats());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
import static com.facebook.presto.hive.HiveSessionProperties.getQuickStatsBackgroundBuildTimeout;
import static com.facebook.presto.hive.HiveSessionProperties.getQuickStatsInlineBuildTimeout;
import static com.facebook.presto.hive.HiveSessionProperties.isQuickStatsEnabled;
import static com.facebook.presto.hive.HiveSessionProperties.isSkipEmptyFilesEnabled;
import static com.facebook.presto.hive.HiveSessionProperties.isUseListDirectoryCache;
import static com.facebook.presto.hive.HiveUtil.buildDirectoryContextProperties;
import static com.facebook.presto.hive.NestedDirectoryPolicy.IGNORED;
Expand Down Expand Up @@ -329,7 +330,7 @@ private PartitionStatistics buildQuickStats(String partitionKey, String partitio

HdfsContext hdfsContext = new HdfsContext(session, table.getSchemaName(), table.getTableName(), partitionId, false);
HiveDirectoryContext hiveDirectoryContext = new HiveDirectoryContext(recursiveDirWalkerEnabled ? RECURSE : IGNORED, isUseListDirectoryCache(session),
hdfsContext.getIdentity(), buildDirectoryContextProperties(session), session.getRuntimeStats());
isSkipEmptyFilesEnabled(session), hdfsContext.getIdentity(), buildDirectoryContextProperties(session), session.getRuntimeStats());
ExtendedFileSystem fs;
try {
fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,22 @@ public class HiveFileIterator
private final ListDirectoryOperation listDirectoryOperation;
private final NamenodeStats namenodeStats;
private final NestedDirectoryPolicy nestedDirectoryPolicy;
private final boolean skipEmptyFiles;

private Iterator<HiveFileInfo> remoteIterator = Collections.emptyIterator();

/**
 * Creates an iterator over the files under {@code path}.
 *
 * @param path the initial directory to list; seeded into the pending-paths deque
 * @param listDirectoryOperation the operation used to list a directory's contents
 * @param namenodeStats stats sink for namenode interactions
 * @param nestedDirectoryPolicy how nested directories are handled during iteration
 * @param skipEmptyFiles whether zero-length files are skipped during iteration
 * @throws NullPointerException if any reference argument is null
 */
public HiveFileIterator(
Path path,
ListDirectoryOperation listDirectoryOperation,
NamenodeStats namenodeStats,
NestedDirectoryPolicy nestedDirectoryPolicy,
boolean skipEmptyFiles)
{
// the starting path is queued for listing rather than stored in a field
paths.addLast(requireNonNull(path, "path is null"));
this.listDirectoryOperation = requireNonNull(listDirectoryOperation, "listDirectoryOperation is null");
this.namenodeStats = requireNonNull(namenodeStats, "namenodeStats is null");
this.nestedDirectoryPolicy = requireNonNull(nestedDirectoryPolicy, "nestedDirectoryPolicy is null");
this.skipEmptyFiles = skipEmptyFiles;
}

@Override
Expand All @@ -67,7 +70,7 @@ protected HiveFileInfo computeNext()

// Ignore hidden files and directories. Hive ignores files starting with _ and . as well.
String fileName = fileInfo.getPath().getName();
if (fileName.startsWith("_") || fileName.startsWith(".")) {
if (fileName.startsWith("_") || fileName.startsWith(".") || (fileInfo.getLength() == 0 && skipEmptyFiles)) {
continue;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ public void testDefaults()
.setMaxConcurrentQuickStatsCalls(100)
.setMaxConcurrentParquetQuickStatsCalls(500)
.setCteVirtualBucketCount(128)
.setSkipEmptyFilesEnabled(false)
.setAffinitySchedulingFileSectionSize(new DataSize(256, MEGABYTE)));
}

Expand Down Expand Up @@ -292,6 +293,7 @@ public void testExplicitPropertyMappings()
.put("hive.quick-stats.max-concurrent-calls", "101")
.put("hive.cte-virtual-bucket-count", "256")
.put("hive.affinity-scheduling-file-section-size", "512MB")
.put("hive.skip-empty-files", "true")
.build();

HiveClientConfig expected = new HiveClientConfig()
Expand Down Expand Up @@ -412,6 +414,8 @@ public void testExplicitPropertyMappings()
.setParquetQuickStatsFileMetadataFetchTimeout(new Duration(30, TimeUnit.SECONDS))
.setMaxConcurrentParquetQuickStatsCalls(399)
.setMaxConcurrentQuickStatsCalls(101)
.setAffinitySchedulingFileSectionSize(new DataSize(512, MEGABYTE))
.setSkipEmptyFilesEnabled(true)
.setCteVirtualBucketCount(256)
.setAffinitySchedulingFileSectionSize(new DataSize(512, MEGABYTE));

Expand Down
Loading