diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java new file mode 100644 index 00000000000..5bc47a59e10 --- /dev/null +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gobblin.service.modules.dataset; + +import com.google.common.base.Preconditions; +import com.typesafe.config.Config; +import java.io.IOException; +import java.util.ArrayList; +import lombok.Getter; +import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys; +import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorErrorUtils; + + +/** + * Describes a external dataset not on HDFS, for usage with Data-Integration-Library in a generic way - see: https://github.com/linkedin/data-integration-library/tree/master + * Datasets under ExternalUriDatasetDescriptor can also be represented by more specific dataset descriptors, e.g. HttpDatasetDescriptor, SqlDatasetDescriptor, etc. + * e.g, https://some-api:443/user/123/names for a http URI + * e.g, jdbc:mysql://some-db:3306/db for a sql URI + */ +public class ExternalUriDatasetDescriptor extends BaseDatasetDescriptor implements DatasetDescriptor { + + @Getter + private final String uri; + + public ExternalUriDatasetDescriptor(Config config) throws IOException { + super(config); + Preconditions.checkArgument(config.hasPath(DatasetDescriptorConfigKeys.URI_KEY), "Dataset descriptor config must specify a URI"); + // refers to an external URI of a given dataset, see https://github.com/linkedin/data-integration-library/blob/master/docs/parameters/ms.source.uri.md + this.uri = config.getString(DatasetDescriptorConfigKeys.URI_KEY); + } + + @Override + public String getPath() { + return this.uri; + } + + /** + * Check if this dataset descriptor is equivalent to another dataset descriptor + * + * @param inputDatasetDescriptorConfig whose path should be in the format of an external path (e.g. http url) + */ + @Override + protected ArrayList isPathContaining(DatasetDescriptor inputDatasetDescriptorConfig) { + ArrayList errors = new ArrayList<>(); + String otherPath = inputDatasetDescriptorConfig.getPath(); + DatasetDescriptorErrorUtils.populateErrorForDatasetDescriptorKey(errors, inputDatasetDescriptorConfig.getIsInputDataset(), DatasetDescriptorConfigKeys.URI_KEY, this.getPath(), otherPath, false); + return errors; + } +} diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java index 36d473b0538..3e98346f1a6 100644 --- a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java @@ -33,6 +33,8 @@ public class DatasetDescriptorConfigKeys { public static final String PATH_KEY = "path"; public static final String SUBPATHS_KEY = "subPaths"; public static final String FS_URI_KEY = "fs.uri"; + + public static final String URI_KEY = "uri"; public static final String DATABASE_KEY = "databaseName"; public static final String TABLE_KEY = "tableName"; public static final String FORMAT_KEY = "format"; diff --git a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java new file mode 100644 index 00000000000..903654d6e90 --- /dev/null +++ b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gobblin.service.modules.flowgraph.datanodes; + +import com.typesafe.config.Config; +import org.apache.gobblin.service.modules.dataset.ExternalUriDatasetDescriptor; +import org.apache.gobblin.service.modules.flowgraph.BaseDataNode; + + +/** + * A DataNode for generic ingress/egress data movement outside of HDFS (HTTP or otherwise) + */ +public class ExternalUriDataNode extends BaseDataNode { + public static final String EXTERNAL_PLATFORM_NAME = "external"; + + public ExternalUriDataNode(Config nodeProps) throws DataNodeCreationException { + super(nodeProps); + } + + @Override + public String getDefaultDatasetDescriptorPlatform() { + return EXTERNAL_PLATFORM_NAME; + } + + @Override + public String getDefaultDatasetDescriptorClass() { + return ExternalUriDatasetDescriptor.class.getCanonicalName(); + } +} diff --git a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java new file mode 100644 index 00000000000..4c5b313c9f6 --- /dev/null +++ b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gobblin.service.modules.dataset; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import com.typesafe.config.ConfigValueFactory; +import java.io.IOException; +import org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys; +import org.junit.Assert; +import org.testng.annotations.Test; + + +public class ExternalUriDatasetDescriptorTest { + + @Test + public void testContains() throws IOException { + Config config1 = ConfigFactory.empty() + .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) + .withValue(DatasetDescriptorConfigKeys.URI_KEY, ConfigValueFactory.fromAnyRef("https://a.com/b")); + ExternalUriDatasetDescriptor descriptor1 = new ExternalUriDatasetDescriptor(config1); + + // Verify that same path points to same dataset + Config config2 = ConfigFactory.empty() + .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) + .withValue(DatasetDescriptorConfigKeys.URI_KEY, ConfigValueFactory.fromAnyRef("https://a.com/b")); + ExternalUriDatasetDescriptor descriptor2 = new ExternalUriDatasetDescriptor(config2); + Assert.assertEquals(descriptor2.contains(descriptor1).size(), 0); + + // Verify that different path points to different dataset + Config config3 = ConfigFactory.empty() + .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external")) + .withValue(DatasetDescriptorConfigKeys.URI_KEY, ConfigValueFactory.fromAnyRef("https://a.com/c")); + ExternalUriDatasetDescriptor descriptor3 = new ExternalUriDatasetDescriptor(config3); + Assert.assertNotEquals(descriptor3.contains(descriptor1).size(), 0); + + } +} \ No newline at end of file diff --git a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java new file mode 100644 index 00000000000..f44c58dcdd9 --- /dev/null +++ b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gobblin.service.modules.flowgraph.datanodes; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import com.typesafe.config.ConfigValueFactory; +import org.apache.gobblin.service.modules.dataset.ExternalUriDatasetDescriptor; +import org.apache.gobblin.service.modules.flowgraph.DataNode; +import org.apache.gobblin.service.modules.flowgraph.FlowGraphConfigurationKeys; +import org.junit.Assert; +import org.testng.annotations.Test; + + +public class ExternalDataNodeTest { + + @Test + public void testConfig() throws DataNode.DataNodeCreationException { + String expectedNodeId = "some-node-id"; + + Config config = ConfigFactory.empty() + .withValue(FlowGraphConfigurationKeys.DATA_NODE_ID_KEY, ConfigValueFactory.fromAnyRef(expectedNodeId)); + ExternalUriDataNode node = new ExternalUriDataNode(config); + + // Verify the node id + String id = node.getId(); + Assert.assertEquals(id, expectedNodeId); + Assert.assertEquals(node.getDefaultDatasetDescriptorPlatform(), "external"); + Assert.assertEquals(node.getDefaultDatasetDescriptorClass(), ExternalUriDatasetDescriptor.class.getCanonicalName()); + } +} \ No newline at end of file