|
17 | 17 |
|
18 | 18 | package org.apache.gobblin.data.management.copy.hive;
|
19 | 19 |
|
20 |
| -import com.google.common.base.Throwables; |
21 | 20 | import java.io.IOException;
|
22 | 21 | import java.lang.reflect.InvocationTargetException;
|
23 | 22 | import java.net.URISyntaxException;
|
24 | 23 | import java.util.Collection;
|
25 | 24 | import java.util.Iterator;
|
26 | 25 | import java.util.List;
|
27 | 26 | import java.util.Properties;
|
28 |
| - |
29 |
| -import javax.annotation.Nonnull; |
30 |
| - |
31 |
| -import lombok.Data; |
32 |
| -import lombok.Getter; |
33 |
| -import lombok.extern.slf4j.Slf4j; |
| 27 | +import java.util.regex.Pattern; |
34 | 28 |
|
35 | 29 | import org.apache.commons.lang3.StringUtils;
|
36 | 30 | import org.apache.commons.lang3.reflect.ConstructorUtils;
|
|
43 | 37 | import com.google.common.base.Optional;
|
44 | 38 | import com.google.common.base.Preconditions;
|
45 | 39 | import com.google.common.base.Predicate;
|
| 40 | +import com.google.common.base.Throwables; |
46 | 41 | import com.google.common.collect.AbstractIterator;
|
47 | 42 | import com.google.common.collect.Iterables;
|
48 | 43 | import com.google.common.collect.Lists;
|
49 | 44 | import com.typesafe.config.Config;
|
50 | 45 | import com.typesafe.config.ConfigFactory;
|
51 | 46 |
|
| 47 | +import javax.annotation.Nonnull; |
| 48 | +import lombok.Data; |
| 49 | +import lombok.Getter; |
| 50 | +import lombok.extern.slf4j.Slf4j; |
| 51 | + |
52 | 52 | import org.apache.gobblin.config.client.ConfigClient;
|
53 | 53 | import org.apache.gobblin.config.client.ConfigClientCache;
|
54 | 54 | import org.apache.gobblin.config.client.ConfigClientUtils;
|
@@ -80,6 +80,9 @@ public class HiveDatasetFinder implements IterableDatasetFinder<HiveDataset> {
|
80 | 80 | public static final String DEFAULT_TABLE_PATTERN = "*";
|
81 | 81 | public static final String TABLE_FILTER = HIVE_DATASET_PREFIX + ".tableFilter";
|
82 | 82 |
|
| 83 | + // Regex allowlist on a table's physical location: only tables whose location fully matches the pattern are kept |
| 84 | + public static final String TABLE_FOLDER_ALLOWLIST_FILTER = HIVE_DATASET_PREFIX + ".tableFolderAllowlistFilter"; |
| 85 | + |
83 | 86 | /*
|
84 | 87 | * By setting the prefix, only config keys with this prefix will be used to build a HiveDataset.
|
85 | 88 | * By passing scoped configurations the same config keys can be used in different contexts.
|
@@ -118,6 +121,8 @@ public class HiveDatasetFinder implements IterableDatasetFinder<HiveDataset> {
|
118 | 121 | protected final Function<Table, String> configStoreDatasetUriBuilder;
|
119 | 122 | protected final Optional<Predicate<Table>> tableFilter;
|
120 | 123 |
|
| 124 | + protected final Optional<Pattern> tableFolderAllowlistRegex; |
| 125 | + |
121 | 126 | protected final String datasetConfigPrefix;
|
122 | 127 | protected final ConfigClient configClient;
|
123 | 128 | private final Config jobConfig;
|
@@ -194,6 +199,8 @@ protected HiveDatasetFinder(FileSystem fs, Properties properties, HiveMetastoreC
|
194 | 199 | } else {
|
195 | 200 | this.tableFilter = Optional.absent();
|
196 | 201 | }
|
| 202 | + this.tableFolderAllowlistRegex = properties.containsKey(TABLE_FOLDER_ALLOWLIST_FILTER) ? |
| 203 | + Optional.of(Pattern.compile(properties.getProperty(TABLE_FOLDER_ALLOWLIST_FILTER))): Optional.absent(); |
197 | 204 | }
|
198 | 205 |
|
199 | 206 | protected static HiveMetastoreClientPool createClientPool(Properties properties) throws IOException {
|
@@ -262,7 +269,10 @@ protected HiveDataset computeNext() {
|
262 | 269 |
|
263 | 270 | try (AutoReturnableObject<IMetaStoreClient> client = HiveDatasetFinder.this.clientPool.getClient()) {
|
264 | 271 | Table table = client.get().getTable(dbAndTable.getDb(), dbAndTable.getTable());
|
265 |
| - if (tableFilter.isPresent() && !tableFilter.get().apply(table)) { |
| 272 | + if ((tableFilter.isPresent() && !tableFilter.get().apply(table)) |
| 273 | + || !shouldAllowTableLocation(tableFolderAllowlistRegex, table)) { |
| 274 | + log.info("Ignoring table {} as its underlying location {} does not pass allowlist regex {}", dbAndTable, |
| 275 | + table.getSd().getLocation(), tableFolderAllowlistRegex.get()); |
266 | 276 | continue;
|
267 | 277 | }
|
268 | 278 |
|
@@ -294,6 +304,12 @@ protected HiveDataset computeNext() {
|
294 | 304 | };
|
295 | 305 | }
|
296 | 306 |
|
| 307 | + protected static boolean shouldAllowTableLocation(Optional<Pattern> regex, Table table) { |
| 308 | + if (!regex.isPresent()) { |
| 309 | + return true; |
| 310 | + } |
| 311 | + return regex.get().matcher(table.getSd().getLocation()).matches(); |
| 312 | + } |
297 | 313 |
|
298 | 314 | /**
|
299 | 315 | * @deprecated Use {@link #createHiveDataset(Table, Config)} instead
|
|
0 commit comments