From 74f9f55b3f2728f1044674a24a77fccd73715f5b Mon Sep 17 00:00:00 2001 From: William Lo Date: Tue, 28 Nov 2023 15:09:19 -0500 Subject: [PATCH 1/5] Add external data node for generic ingress/egress on GaaS --- .../dataset/ExternalDatasetDescriptor.java | 66 +++++++++++++++++++ .../flowgraph/datanodes/ExternalDataNode.java | 27 ++++++++ .../ExternalDatasetDescriptorTest.java | 53 +++++++++++++++ .../datanodes/ExternalDataNodeTest.java | 47 +++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java create mode 100644 gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java create mode 100644 gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java create mode 100644 gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java new file mode 100644 index 00000000000..faccf560cfa --- /dev/null +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gobblin.service.modules.dataset; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigValueFactory; +import java.io.IOException; +import java.util.ArrayList; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; +import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys; +import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorErrorUtils; +import org.apache.gobblin.util.ConfigUtils; + + +/** + * Describes a external dataset not on HDFS + * e.g, https://some-api:443/user/123/names where the path is the full URI + */ +@ToString (exclude = {"rawConfig"}) +@EqualsAndHashCode (exclude = {"rawConfig"}, callSuper = true) +public class ExternalDatasetDescriptor extends BaseDatasetDescriptor implements DatasetDescriptor { + + @Getter + private final String path; + @Getter + private final Config rawConfig; + + public ExternalDatasetDescriptor(Config config) throws IOException { + super(config); + // refers to the full HTTP url + this.path = ConfigUtils.getString(config, DatasetDescriptorConfigKeys.PATH_KEY, ""); + this.rawConfig = config.withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef(this.path)).withFallback(super.getRawConfig()); + this.isInputDataset = ConfigUtils.getBoolean(config, DatasetDescriptorConfigKeys.IS_INPUT_DATASET, false); + } + + /** + * Check if this HTTP path equals the other HTTP path + * + * @param inputDatasetDescriptorConfig whose path should be in the format of a HTTP path + */ + @Override + protected ArrayList isPathContaining(DatasetDescriptor inputDatasetDescriptorConfig) { + // Might be null + ArrayList errors = new ArrayList<>(); + String otherPath = inputDatasetDescriptorConfig.getPath(); + DatasetDescriptorErrorUtils.populateErrorForDatasetDescriptorKey(errors, inputDatasetDescriptorConfig.getIsInputDataset(), DatasetDescriptorConfigKeys.PATH_KEY, this.getPath(), otherPath, false); + return errors; + } +} diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java new file mode 100644 index 00000000000..712d6c9e73e --- /dev/null +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java @@ -0,0 +1,27 @@ +package org.apache.gobblin.service.modules.flowgraph.datanodes; + +import com.typesafe.config.Config; +import org.apache.gobblin.service.modules.dataset.ExternalDatasetDescriptor; +import org.apache.gobblin.service.modules.flowgraph.BaseDataNode; + + +/** + * A DataNode for generic ingress/egress data movement outside of HDFS (HTTP or otherwise) + */ +public class ExternalDataNode extends BaseDataNode { + public static final String EXTERNAL_PLATFORM_NAME = "external"; + + public ExternalDataNode(Config nodeProps) throws DataNodeCreationException { + super(nodeProps); + } + + @Override + public String getDefaultDatasetDescriptorPlatform() { + return EXTERNAL_PLATFORM_NAME; + } + + @Override + public String getDefaultDatasetDescriptorClass() { + return ExternalDatasetDescriptor.class.getCanonicalName(); + } +} diff --git a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java new file mode 100644 index 00000000000..1e1525e96b4 --- /dev/null +++ b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gobblin.service.modules.dataset; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import com.typesafe.config.ConfigValueFactory; +import java.io.IOException; +import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys; +import org.junit.Assert; +import org.testng.annotations.Test; + + +public class ExternalDatasetDescriptorTest { + + @Test + public void testContains() throws IOException { + Config config1 = ConfigFactory.empty() + .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) + .withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef("https://a.com/b")); + ExternalDatasetDescriptor descriptor1 = new ExternalDatasetDescriptor(config1); + + // Verify that same path points to same dataset + Config config2 = ConfigFactory.empty() + .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) + .withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef("https://a.com/b")); + ExternalDatasetDescriptor descriptor2 = new ExternalDatasetDescriptor(config2); + Assert.assertEquals(descriptor2.contains(descriptor1).size(), 0); + + // Verify that same path but different platform points to different dataset + Config config3 = ConfigFactory.empty() + .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) + .withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef("https://a.com/c")); + ExternalDatasetDescriptor descriptor3 = new ExternalDatasetDescriptor(config3); + Assert.assertNotEquals(descriptor3.contains(descriptor1).size(), 0); + + } +} \ No newline at end of file diff --git a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java new file mode 100644 index 00000000000..99ecff0a4f7 --- /dev/null +++ b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gobblin.service.modules.flowgraph.datanodes; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import com.typesafe.config.ConfigValueFactory; +import org.apache.gobblin.service.modules.dataset.ExternalDatasetDescriptor; +import org.apache.gobblin.service.modules.flowgraph.DataNode; +import org.apache.gobblin.service.modules.flowgraph.FlowGraphConfigurationKeys; +import org.apache.gobblin.util.ConfigUtils; +import org.junit.Assert; +import org.testng.annotations.Test; + + +public class ExternalDataNodeTest { + + @Test + public void testConfig() throws DataNode.DataNodeCreationException { + String expectedNodeId = "some-node-id"; + + Config config = ConfigFactory.empty() + .withValue(FlowGraphConfigurationKeys.DATA_NODE_ID_KEY, ConfigValueFactory.fromAnyRef(expectedNodeId)); + ExternalDataNode node = new ExternalDataNode(config); + + // Verify the node id + String id = node.getId(); + Assert.assertEquals(id, expectedNodeId); + Assert.assertEquals(node.getDefaultDatasetDescriptorPlatform(), "external"); + Assert.assertEquals(node.getDefaultDatasetDescriptorClass(), ExternalDatasetDescriptor.class.getCanonicalName()); + } +} \ No newline at end of file From 9119ad3159f6afc2d1f5a8195a6f1cb4b7400974 Mon Sep 17 00:00:00 2001 From: William Lo Date: Wed, 29 Nov 2023 15:59:38 -0500 Subject: [PATCH 2/5] Address reviews and cleanup --- .../dataset/ExternalDatasetDescriptor.java | 16 ++++------------ .../flowgraph/datanodes/ExternalDataNode.java | 17 +++++++++++++++++ .../dataset/ExternalDatasetDescriptorTest.java | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java index faccf560cfa..ee03af0439d 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java @@ -17,13 +17,11 @@ package org.apache.gobblin.service.modules.dataset; +import com.google.common.base.Preconditions; import com.typesafe.config.Config; -import com.typesafe.config.ConfigValueFactory; import java.io.IOException; import java.util.ArrayList; -import lombok.EqualsAndHashCode; import lombok.Getter; -import lombok.ToString; import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys; import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorErrorUtils; import org.apache.gobblin.util.ConfigUtils; @@ -33,31 +31,25 @@ * Describes a external dataset not on HDFS * e.g, https://some-api:443/user/123/names where the path is the full URI */ -@ToString (exclude = {"rawConfig"}) -@EqualsAndHashCode (exclude = {"rawConfig"}, callSuper = true) public class ExternalDatasetDescriptor extends BaseDatasetDescriptor implements DatasetDescriptor { @Getter private final String path; - @Getter - private final Config rawConfig; public ExternalDatasetDescriptor(Config config) throws IOException { super(config); + Preconditions.checkArgument(config.hasPath(DatasetDescriptorConfigKeys.PATH_KEY), "Dataset descriptor config must specify path"); // refers to the full HTTP url this.path = ConfigUtils.getString(config, DatasetDescriptorConfigKeys.PATH_KEY, ""); - this.rawConfig = config.withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef(this.path)).withFallback(super.getRawConfig()); - this.isInputDataset = ConfigUtils.getBoolean(config, DatasetDescriptorConfigKeys.IS_INPUT_DATASET, false); } /** - * Check if this HTTP path equals the other HTTP path + * Check if this dataset descriptor is equivalent to another dataset descriptor * - * @param inputDatasetDescriptorConfig whose path should be in the format of a HTTP path + * @param inputDatasetDescriptorConfig whose path should be in the format of an external path (e.g. http url) */ @Override protected ArrayList isPathContaining(DatasetDescriptor inputDatasetDescriptorConfig) { - // Might be null ArrayList errors = new ArrayList<>(); String otherPath = inputDatasetDescriptorConfig.getPath(); DatasetDescriptorErrorUtils.populateErrorForDatasetDescriptorKey(errors, inputDatasetDescriptorConfig.getIsInputDataset(), DatasetDescriptorConfigKeys.PATH_KEY, this.getPath(), otherPath, false); diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java index 712d6c9e73e..c114bf8fc82 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.gobblin.service.modules.flowgraph.datanodes; import com.typesafe.config.Config; diff --git a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java index 1e1525e96b4..fa646a2c6a4 100644 --- a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java +++ b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java @@ -42,7 +42,7 @@ public void testContains() throws IOException { ExternalDatasetDescriptor descriptor2 = new ExternalDatasetDescriptor(config2); Assert.assertEquals(descriptor2.contains(descriptor1).size(), 0); - // Verify that same path but different platform points to different dataset + // Verify that different path points to different dataset Config config3 = ConfigFactory.empty() .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) .withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef("https://a.com/c")); From 47f004bf98cd5bbe4d76c0b7e8fbd403a5ec32b1 Mon Sep 17 00:00:00 2001 From: William Lo Date: Wed, 29 Nov 2023 20:58:42 -0500 Subject: [PATCH 3/5] Use URI representation for external dataset descriptor node --- ...java => ExternalUriDatasetDescriptor.java} | 24 ++++++++++++------- .../DatasetDescriptorConfigKeys.java | 2 ++ .../flowgraph/datanodes/ExternalDataNode.java | 4 ++-- ... => ExternalUriDatasetDescriptorTest.java} | 14 +++++------ .../datanodes/ExternalDataNodeTest.java | 5 ++-- 5 files changed, 28 insertions(+), 21 deletions(-) rename gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/{ExternalDatasetDescriptor.java => ExternalUriDatasetDescriptor.java} (65%) rename gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/{ExternalDatasetDescriptorTest.java => ExternalUriDatasetDescriptorTest.java} (74%) diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java similarity index 65% rename from gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java rename to gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java index ee03af0439d..6884a6f53e5 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptor.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java @@ -24,23 +24,29 @@ import lombok.Getter; import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys; import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorErrorUtils; -import org.apache.gobblin.util.ConfigUtils; /** - * Describes a external dataset not on HDFS - * e.g, https://some-api:443/user/123/names where the path is the full URI + * Describes a external dataset not on HDFS, for usage with Data-Integration-Library in a generic way - see: https://github.com/linkedin/data-integration-library/tree/master + * Datasets under ExternalUriDatasetDescriptor can also be represented by more specific dataset descriptors, e.g. HttpDatasetDescriptor, SqlDatasetDescriptor, etc. + * e.g, https://some-api:443/user/123/names for a http URI + * e.g, jdbc:mysql://some-db:3306/db for a sql URI */ -public class ExternalDatasetDescriptor extends BaseDatasetDescriptor implements DatasetDescriptor { +public class ExternalUriDatasetDescriptor extends BaseDatasetDescriptor implements DatasetDescriptor { @Getter - private final String path; + private final String uri; - public ExternalDatasetDescriptor(Config config) throws IOException { + public ExternalUriDatasetDescriptor(Config config) throws IOException { super(config); - Preconditions.checkArgument(config.hasPath(DatasetDescriptorConfigKeys.PATH_KEY), "Dataset descriptor config must specify path"); - // refers to the full HTTP url - this.path = ConfigUtils.getString(config, DatasetDescriptorConfigKeys.PATH_KEY, ""); + Preconditions.checkArgument(config.hasPath(DatasetDescriptorConfigKeys.URI_KEY), "Dataset descriptor config must specify a uri"); + // refers to an external URI of a given dataset, see https://github.com/linkedin/data-integration-library/blob/master/docs/parameters/ms.source.uri.md + this.uri = config.getString(DatasetDescriptorConfigKeys.URI_KEY); + } + + @Override + public String getPath() { + return this.uri; } /** diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java index 36d473b0538..3e98346f1a6 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java @@ -33,6 +33,8 @@ public class DatasetDescriptorConfigKeys { public static final String PATH_KEY = "path"; public static final String SUBPATHS_KEY = "subPaths"; public static final String FS_URI_KEY = "fs.uri"; + + public static final String URI_KEY = "uri"; public static final String DATABASE_KEY = "databaseName"; public static final String TABLE_KEY = "tableName"; public static final String FORMAT_KEY = "format"; diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java index c114bf8fc82..937bb483789 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java @@ -18,7 +18,7 @@ package org.apache.gobblin.service.modules.flowgraph.datanodes; import com.typesafe.config.Config; -import org.apache.gobblin.service.modules.dataset.ExternalDatasetDescriptor; +import org.apache.gobblin.service.modules.dataset.ExternalUriDatasetDescriptor; import org.apache.gobblin.service.modules.flowgraph.BaseDataNode; @@ -39,6 +39,6 @@ public String getDefaultDatasetDescriptorPlatform() { @Override public String getDefaultDatasetDescriptorClass() { - return ExternalDatasetDescriptor.class.getCanonicalName(); + return ExternalUriDatasetDescriptor.class.getCanonicalName(); } } diff --git a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java similarity index 74% rename from gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java rename to gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java index fa646a2c6a4..4c5b313c9f6 100644 --- a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalDatasetDescriptorTest.java +++ b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java @@ -26,27 +26,27 @@ import org.testng.annotations.Test; -public class ExternalDatasetDescriptorTest { +public class ExternalUriDatasetDescriptorTest { @Test public void testContains() throws IOException { Config config1 = ConfigFactory.empty() .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) - .withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef("https://a.com/b")); - ExternalDatasetDescriptor descriptor1 = new ExternalDatasetDescriptor(config1); + .withValue(DatasetDescriptorConfigKeys.URI_KEY, ConfigValueFactory.fromAnyRef("https://a.com/b")); + ExternalUriDatasetDescriptor descriptor1 = new ExternalUriDatasetDescriptor(config1); // Verify that same path points to same dataset Config config2 = ConfigFactory.empty() .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) - .withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef("https://a.com/b")); - ExternalDatasetDescriptor descriptor2 = new ExternalDatasetDescriptor(config2); + .withValue(DatasetDescriptorConfigKeys.URI_KEY, ConfigValueFactory.fromAnyRef("https://a.com/b")); + ExternalUriDatasetDescriptor descriptor2 = new ExternalUriDatasetDescriptor(config2); Assert.assertEquals(descriptor2.contains(descriptor1).size(), 0); // Verify that different path points to different dataset Config config3 = ConfigFactory.empty() .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) - .withValue(DatasetDescriptorConfigKeys.PATH_KEY, ConfigValueFactory.fromAnyRef("https://a.com/c")); - ExternalDatasetDescriptor descriptor3 = new ExternalDatasetDescriptor(config3); + .withValue(DatasetDescriptorConfigKeys.URI_KEY, ConfigValueFactory.fromAnyRef("https://a.com/c")); + ExternalUriDatasetDescriptor descriptor3 = new ExternalUriDatasetDescriptor(config3); Assert.assertNotEquals(descriptor3.contains(descriptor1).size(), 0); } diff --git a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java index 99ecff0a4f7..7acca6b2c6f 100644 --- a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java +++ b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java @@ -20,10 +20,9 @@ import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import com.typesafe.config.ConfigValueFactory; -import org.apache.gobblin.service.modules.dataset.ExternalDatasetDescriptor; +import org.apache.gobblin.service.modules.dataset.ExternalUriDatasetDescriptor; import org.apache.gobblin.service.modules.flowgraph.DataNode; import org.apache.gobblin.service.modules.flowgraph.FlowGraphConfigurationKeys; -import org.apache.gobblin.util.ConfigUtils; import org.junit.Assert; import org.testng.annotations.Test; @@ -42,6 +41,6 @@ public void testConfig() throws DataNode.DataNodeCreationException { String id = node.getId(); Assert.assertEquals(id, expectedNodeId); Assert.assertEquals(node.getDefaultDatasetDescriptorPlatform(), "external"); - Assert.assertEquals(node.getDefaultDatasetDescriptorClass(), ExternalDatasetDescriptor.class.getCanonicalName()); + Assert.assertEquals(node.getDefaultDatasetDescriptorClass(), ExternalUriDatasetDescriptor.class.getCanonicalName()); } } \ No newline at end of file From f5480f309acc0ce715050f0b2285926c47f9eede Mon Sep 17 00:00:00 2001 From: William Lo Date: Wed, 29 Nov 2023 21:41:42 -0500 Subject: [PATCH 4/5] Fix error message in containing check --- .../service/modules/dataset/ExternalUriDatasetDescriptor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java index 6884a6f53e5..67ff9a6d936 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java @@ -58,7 +58,7 @@ public String getPath() { protected ArrayList isPathContaining(DatasetDescriptor inputDatasetDescriptorConfig) { ArrayList errors = new ArrayList<>(); String otherPath = inputDatasetDescriptorConfig.getPath(); - DatasetDescriptorErrorUtils.populateErrorForDatasetDescriptorKey(errors, inputDatasetDescriptorConfig.getIsInputDataset(), DatasetDescriptorConfigKeys.PATH_KEY, this.getPath(), otherPath, false); + DatasetDescriptorErrorUtils.populateErrorForDatasetDescriptorKey(errors, inputDatasetDescriptorConfig.getIsInputDataset(), DatasetDescriptorConfigKeys.URI_KEY, this.getPath(), otherPath, false); return errors; } } From fcf7079b0aa5fb04c3133424ecf545391469c958 Mon Sep 17 00:00:00 2001 From: William Lo Date: Thu, 30 Nov 2023 19:01:03 -0500 Subject: [PATCH 5/5] Address review --- .../service/modules/dataset/ExternalUriDatasetDescriptor.java | 2 +- .../{ExternalDataNode.java => ExternalUriDataNode.java} | 4 ++-- .../modules/flowgraph/datanodes/ExternalDataNodeTest.java | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/{ExternalDataNode.java => ExternalUriDataNode.java} (91%) diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java index 67ff9a6d936..5bc47a59e10 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java @@ -39,7 +39,7 @@ public class ExternalUriDatasetDescriptor extends BaseDatasetDescriptor implemen public ExternalUriDatasetDescriptor(Config config) throws IOException { super(config); - Preconditions.checkArgument(config.hasPath(DatasetDescriptorConfigKeys.URI_KEY), "Dataset descriptor config must specify a uri"); + Preconditions.checkArgument(config.hasPath(DatasetDescriptorConfigKeys.URI_KEY), "Dataset descriptor config must specify a URI"); // refers to an external URI of a given dataset, see https://github.com/linkedin/data-integration-library/blob/master/docs/parameters/ms.source.uri.md this.uri = config.getString(DatasetDescriptorConfigKeys.URI_KEY); } diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java similarity index 91% rename from gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java rename to gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java index 937bb483789..903654d6e90 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNode.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java @@ -25,10 +25,10 @@ /** * A DataNode for generic ingress/egress data movement outside of HDFS (HTTP or otherwise) */ -public class ExternalDataNode extends BaseDataNode { +public class ExternalUriDataNode extends BaseDataNode { public static final String EXTERNAL_PLATFORM_NAME = "external"; - public ExternalDataNode(Config nodeProps) throws DataNodeCreationException { + public ExternalUriDataNode(Config nodeProps) throws DataNodeCreationException { super(nodeProps); } diff --git a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java index 7acca6b2c6f..f44c58dcdd9 100644 --- a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java +++ b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java @@ -35,7 +35,7 @@ public void testConfig() throws DataNode.DataNodeCreationException { Config config = ConfigFactory.empty() .withValue(FlowGraphConfigurationKeys.DATA_NODE_ID_KEY, ConfigValueFactory.fromAnyRef(expectedNodeId)); - ExternalDataNode node = new ExternalDataNode(config); + ExternalUriDataNode node = new ExternalUriDataNode(config); // Verify the node id String id = node.getId();