diff --git a/docs/hdfs_connector.rst b/docs/hdfs_connector.rst index 8d42fe287..cb8bc3a0e 100644 --- a/docs/hdfs_connector.rst +++ b/docs/hdfs_connector.rst @@ -306,7 +306,7 @@ To work with secure HDFS and Hive metastore, you need to specify ``hdfs.authenti connect.hdfs.keytab=path to the connector keytab hdfs.namenode.principal=namenode principal -You need to create the Kafka connect principals and keytab files via Kerboros and distribute the +You need to create the Kafka connect principals and keytab files via Kerberos and distribute the keytab file to all hosts that running the connector and ensures that only the connector user has read access to the keytab file. diff --git a/pom.xml b/pom.xml index 5afa18e1f..2eb636f35 100644 --- a/pom.xml +++ b/pom.xml @@ -84,39 +84,29 @@ ${confluent.version} - org.apache.hadoop - hadoop-client - ${hadoop.version} - - - org.apache.hive - hive-cli - ${hive.version} - - - org.apache.hive - hive-common - ${hive.version} + io.confluent + kafka-connect-storage-common + ${confluent.version} - org.apache.avro - avro-mapred - ${avro.version} + io.confluent + kafka-connect-storage-core + ${confluent.version} - org.apache.parquet - parquet-column - ${parquet.version} + io.confluent + kafka-connect-storage-format + ${confluent.version} - org.apache.parquet - parquet-avro - ${parquet.version} + io.confluent + kafka-connect-storage-partitioner + ${confluent.version} - commons-io - commons-io - ${commons-io.version} + io.confluent + kafka-connect-storage-wal + ${confluent.version} junit @@ -207,6 +197,7 @@ src/assembly/development.xml src/assembly/package.xml + false diff --git a/src/assembly/development.xml b/src/assembly/development.xml index 498a914a2..96cf78082 100644 --- a/src/assembly/development.xml +++ b/src/assembly/development.xml @@ -19,7 +19,6 @@ shipped with Confluent platform and other dependencies such as Hadoop and Avro. This allows correctly setup of CLASSPATH in kafka-run-class.sh when running kafka connect hdfs connector. --> - org.apache.kafka:connect-api org.mortbay.jetty:* com.sun.jersey:* org.eclipse.jetty.aggregate:jetty-all diff --git a/src/assembly/package.xml b/src/assembly/package.xml index 3d9dae049..449606b93 100644 --- a/src/assembly/package.xml +++ b/src/assembly/package.xml @@ -38,11 +38,16 @@ shipped with Confluent platform and other dependencies such as Hadoop and Avro. This allows correctly setup of CLASSPATH in kafka-run-class.sh when running kafka connect hdfs connector. 
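[Editor's sketch] The Kerberos properties shown in the doc fix at the top of this patch pair with the security settings defined later in HdfsSinkConnectorConfig. As a rough illustration of a complete secure-HDFS sink configuration assembled in code — every host name, principal, and path below is a placeholder, not a value from this change:

import java.util.HashMap;
import java.util.Map;

// Sketch only: the keys come from the doc snippet and the config class in this diff,
// the values are illustrative.
public class SecureHdfsSinkProps {
  public static Map<String, String> kerberosSinkProps() {
    Map<String, String> props = new HashMap<>();
    props.put("name", "hdfs-sink");
    props.put("connector.class", "io.confluent.connect.hdfs.HdfsSinkConnector");
    props.put("topics", "test_hdfs");
    props.put("hdfs.url", "hdfs://namenode.example.com:8020");
    props.put("flush.size", "3");
    props.put("hdfs.authentication.kerberos", "true");
    props.put("connect.hdfs.principal", "connect-hdfs/_HOST@EXAMPLE.COM");
    props.put("connect.hdfs.keytab", "/etc/security/keytabs/connect-hdfs.keytab");
    props.put("hdfs.namenode.principal", "nn/_HOST@EXAMPLE.COM");
    return props;
  }
}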
--> - org.apache.kafka:connect-api org.mortbay.jetty:* com.sun.jersey:* org.eclipse.jetty.aggregate:jetty-all com.sun.jersey.contribs:jersey-guice + io.confluent:kafka-connect-storage-common + io.confluent:kafka-connect-storage-core + io.confluent:kafka-connect-storage-format + io.confluent:kafka-connect-storage-hive + io.confluent:kafka-connect-storage-partitioner + io.confluent:kafka-connect-storage-wal diff --git a/src/main/java/io/confluent/connect/hdfs/DataWriter.java b/src/main/java/io/confluent/connect/hdfs/DataWriter.java index 62f00c12e..9616afc28 100644 --- a/src/main/java/io/confluent/connect/hdfs/DataWriter.java +++ b/src/main/java/io/confluent/connect/hdfs/DataWriter.java @@ -29,6 +29,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.lang.reflect.InvocationTargetException; import java.net.InetAddress; import java.util.Collection; import java.util.HashMap; @@ -51,22 +52,29 @@ import io.confluent.connect.hdfs.hive.HiveMetaStore; import io.confluent.connect.hdfs.hive.HiveUtil; import io.confluent.connect.hdfs.partitioner.Partitioner; +import io.confluent.connect.hdfs.storage.HdfsStorage; import io.confluent.connect.hdfs.storage.Storage; -import io.confluent.connect.hdfs.storage.StorageFactory; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.format.SchemaFileReader; +import io.confluent.connect.storage.hive.HiveConfig; +import io.confluent.connect.storage.partitioner.PartitionerConfig; public class DataWriter { private static final Logger log = LoggerFactory.getLogger(DataWriter.class); private Map topicPartitionWriters; private String url; - private Storage storage; - private Configuration conf; + private HdfsStorage storage; private String topicsDir; private Format format; + private RecordWriterProvider writerProvider; + private io.confluent.connect.storage.format.RecordWriterProvider + newWriterProvider; + private io.confluent.connect.storage.format.SchemaFileReader + schemaFileReader; + private io.confluent.connect.storage.format.Format newFormat; private Set assignment; private Partitioner partitioner; - private RecordWriterProvider writerProvider; - private SchemaFileReader schemaFileReader; private Map offsets; private HdfsSinkConnectorConfig connectorConfig; private AvroData avroData; @@ -91,7 +99,7 @@ public DataWriter(HdfsSinkConnectorConfig connectorConfig, SinkTaskContext conte String hadoopConfDir = connectorConfig.getString(HdfsSinkConnectorConfig.HADOOP_CONF_DIR_CONFIG); log.info("Hadoop configuration directory {}", hadoopConfDir); - conf = new Configuration(); + Configuration conf = connectorConfig.getHadoopConfiguration(); if (!hadoopConfDir.equals("")) { conf.addResource(new Path(hadoopConfDir + "/core-site.xml")); conf.addResource(new Path(hadoopConfDir + "/hdfs-site.xml")); @@ -105,7 +113,7 @@ public DataWriter(HdfsSinkConnectorConfig connectorConfig, SinkTaskContext conte if (principalConfig == null || keytab == null) { throw new ConfigException( - "Hadoop is using Kerboros for authentication, you need to provide both a connect principal and " + "Hadoop is using Kerberos for authentication, you need to provide both a connect principal and " + "the path to the keytab of the principal."); } @@ -159,32 +167,104 @@ public void run() { } url = connectorConfig.getString(HdfsSinkConnectorConfig.HDFS_URL_CONFIG); - topicsDir = connectorConfig.getString(HdfsSinkConnectorConfig.TOPICS_DIR_CONFIG); + topicsDir = connectorConfig.getString(StorageCommonConfig.TOPICS_DIR_CONFIG); String 
logsDir = connectorConfig.getString(HdfsSinkConnectorConfig.LOGS_DIR_CONFIG); @SuppressWarnings("unchecked") - Class storageClass = (Class) Class - .forName(connectorConfig.getString(HdfsSinkConnectorConfig.STORAGE_CLASS_CONFIG)); - storage = StorageFactory.createStorage(storageClass, conf, url); + Class storageClass = (Class) connectorConfig + .getClass(StorageCommonConfig.STORAGE_CLASS_CONFIG); + storage = io.confluent.connect.storage.StorageFactory.createStorage( + storageClass, + HdfsSinkConnectorConfig.class, + connectorConfig, + url + ); createDir(topicsDir); createDir(topicsDir + HdfsSinkConnectorConstants.TEMPFILE_DIRECTORY); createDir(logsDir); - format = getFormat(); - writerProvider = format.getRecordWriterProvider(); - schemaFileReader = format.getSchemaFileReader(avroData); + // Try to instantiate as a new-style storage-common type class, then fall back to old-style with + // no parameters + try { + Class formatClass = + (Class) connectorConfig.getClass(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG); + newFormat = formatClass.getConstructor(HdfsStorage.class).newInstance(storage); + newWriterProvider = newFormat.getRecordWriterProvider(); + schemaFileReader = newFormat.getSchemaFileReader(); + } catch (NoSuchMethodException e) { + Class formatClass = + (Class) connectorConfig.getClass(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG); + format = formatClass.getConstructor().newInstance(); + writerProvider = format.getRecordWriterProvider(); + final io.confluent.connect.hdfs.SchemaFileReader oldReader + = format.getSchemaFileReader(avroData); + schemaFileReader = new SchemaFileReader() { + @Override + public Schema getSchema(HdfsSinkConnectorConfig hdfsSinkConnectorConfig, Path path) { + try { + return oldReader.getSchema(hdfsSinkConnectorConfig.getHadoopConfiguration(), path); + } catch (IOException e) { + throw new ConnectException("Failed to get schema", e); + } + } + + @Override + public Iterator iterator() { + throw new UnsupportedOperationException(); + } - partitioner = createPartitioner(connectorConfig); + @Override + public boolean hasNext() { + throw new UnsupportedOperationException(); + } + + @Override + public Object next() { + throw new UnsupportedOperationException(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @Override + public void close() throws IOException { + + } + }; + } + + partitioner = newPartitioner(connectorConfig); assignment = new HashSet<>(context.assignment()); offsets = new HashMap<>(); - hiveIntegration = connectorConfig.getBoolean(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG); + hiveIntegration = connectorConfig.getBoolean(HiveConfig.HIVE_INTEGRATION_CONFIG); if (hiveIntegration) { - hiveDatabase = connectorConfig.getString(HdfsSinkConnectorConfig.HIVE_DATABASE_CONFIG); + hiveDatabase = connectorConfig.getString(HiveConfig.HIVE_DATABASE_CONFIG); hiveMetaStore = new HiveMetaStore(conf, connectorConfig); - hive = format.getHiveUtil(connectorConfig, avroData, hiveMetaStore); + if (format != null) { + hive = format.getHiveUtil(connectorConfig, hiveMetaStore); + } else if (newFormat != null) { + final io.confluent.connect.storage.hive.HiveUtil newHiveUtil + = newFormat.getHiveFactory().createHiveUtil(connectorConfig, hiveMetaStore); + hive = new HiveUtil(connectorConfig, hiveMetaStore) { + @Override + public void createTable(String database, String tableName, Schema schema, + Partitioner partitioner) { + newHiveUtil.createTable(database, tableName, schema, partitioner); + } + + @Override + public void 
alterSchema(String database, String tableName, Schema schema) { + newHiveUtil.alterSchema(database, tableName, schema); + } + }; + } else { + throw new ConnectException("One of old or new format classes must be provided"); + } executorService = Executors.newSingleThreadExecutor(); hiveUpdateFutures = new LinkedList<>(); } @@ -192,11 +272,24 @@ public void run() { topicPartitionWriters = new HashMap<>(); for (TopicPartition tp: assignment) { TopicPartitionWriter topicPartitionWriter = new TopicPartitionWriter( - tp, storage, writerProvider, partitioner, connectorConfig, context, avroData, hiveMetaStore, hive, schemaFileReader, executorService, - hiveUpdateFutures); + tp, + storage, + writerProvider, + newWriterProvider, + partitioner, + connectorConfig, + context, + avroData, + hiveMetaStore, + hive, + schemaFileReader, + executorService, + hiveUpdateFutures + ); topicPartitionWriters.put(tp, topicPartitionWriter); } - } catch (ClassNotFoundException | IllegalAccessException | InstantiationException e) { + } catch (ClassNotFoundException | IllegalAccessException | InstantiationException | + InvocationTargetException | NoSuchMethodException e) { throw new ConnectException("Reflection exception: ", e); } catch (IOException e) { throw new ConnectException(e); @@ -251,7 +344,12 @@ public void syncWithHive() throws ConnectException { CommittedFileFilter filter = new TopicCommittedFileFilter(topic); FileStatus fileStatusWithMaxOffset = FileUtils.fileStatusWithMaxOffset(storage, new Path(topicDir), filter); if (fileStatusWithMaxOffset != null) { - Schema latestSchema = schemaFileReader.getSchema(conf, fileStatusWithMaxOffset.getPath()); + final Path path = fileStatusWithMaxOffset.getPath(); + final Schema latestSchema; + latestSchema = schemaFileReader.getSchema( + connectorConfig, + path + ); hive.createTable(hiveDatabase, topic, latestSchema, partitioner); List partitions = hiveMetaStore.listPartitions(hiveDatabase, topic, (short) -1); FileStatus[] statuses = FileUtils.getDirectories(storage, new Path(topicDir)); @@ -273,8 +371,20 @@ public void open(Collection partitions) { assignment = new HashSet<>(partitions); for (TopicPartition tp: assignment) { TopicPartitionWriter topicPartitionWriter = new TopicPartitionWriter( - tp, storage, writerProvider, partitioner, connectorConfig, context, avroData, - hiveMetaStore, hive, schemaFileReader, executorService, hiveUpdateFutures); + tp, + storage, + writerProvider, + newWriterProvider, + partitioner, + connectorConfig, + context, + avroData, + hiveMetaStore, + hive, + schemaFileReader, + executorService, + hiveUpdateFutures + ); topicPartitionWriters.put(tp, topicPartitionWriter); // We need to immediately start recovery to ensure we pause consumption of messages for the // assigned topics while we try to recover offsets and rewind. @@ -282,7 +392,7 @@ public void open(Collection partitions) { } } - public void close(Collection partitions) { + public void close() { // Close any writers we have. We may get assigned the same partitions and end up duplicating // some effort since we'll have to reprocess those messages. 
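[Editor's sketch] The try/instantiate/fall-back logic above is the core compatibility trick of this change: probe for the storage-common style constructor that accepts the storage instance, and only fall back to the legacy no-arg constructor (plus adapter wrappers) when it is absent. A stripped-down sketch of that probing pattern — a generic helper with illustrative names, not the connector's actual method:

import java.lang.reflect.Constructor;

public final class ConstructorProbe {
  private ConstructorProbe() {}

  public static <T> T newInstancePreferring(Class<T> clazz, Class<?> paramType, Object arg)
      throws ReflectiveOperationException {
    try {
      // New-style classes take the storage instance (HdfsStorage in the diff) directly.
      Constructor<T> withStorage = clazz.getConstructor(paramType);
      return withStorage.newInstance(arg);
    } catch (NoSuchMethodException e) {
      // Old-style classes only expose a no-arg constructor; callers then wrap their
      // RecordWriterProvider/SchemaFileReader in adapters, as DataWriter does above.
      return clazz.getConstructor().newInstance();
    }
  }
}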
It may be possible to hold on to // the TopicPartitionWriter and continue to use the temp file, but this can get significantly @@ -322,11 +432,8 @@ public void stop() { } } - try { - storage.close(); - } catch (IOException e) { - throw new ConnectException(e); - } + storage.close(); + if (ticketRenewThread != null) { synchronized (this) { isRunning = false; @@ -354,7 +461,7 @@ public Storage getStorage() { return storage; } - public Map> getWriters(TopicPartition tp) { + Map getWriters(TopicPartition tp) { return topicPartitionWriters.get(tp).getWriters(); } @@ -363,16 +470,23 @@ public Map getTempFileNames(TopicPartition tp) { return topicPartitionWriter.getTempFiles(); } - private void createDir(String dir) throws IOException { + private void createDir(String dir) { String path = url + "/" + dir; if (!storage.exists(path)) { - storage.mkdirs(path); + storage.create(path); } } - @SuppressWarnings("unchecked") - private Format getFormat() throws ClassNotFoundException, IllegalAccessException, InstantiationException{ - return ((Class) Class.forName(connectorConfig.getString(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG))).newInstance(); + private Partitioner newPartitioner(HdfsSinkConnectorConfig config) + throws ClassNotFoundException, IllegalAccessException, InstantiationException { + + @SuppressWarnings("unchecked") + Class partitionerClass = + (Class) config.getClass(PartitionerConfig.PARTITIONER_CLASS_CONFIG); + + Partitioner partitioner = partitionerClass.newInstance(); + partitioner.configure(new HashMap<>(config.plainValues())); + return partitioner; } private String getPartitionValue(String path) { @@ -391,7 +505,7 @@ private Partitioner createPartitioner(HdfsSinkConnectorConfig config) @SuppressWarnings("unchecked") Class partitionerClasss = (Class) - Class.forName(config.getString(HdfsSinkConnectorConfig.PARTITIONER_CLASS_CONFIG)); + Class.forName(config.getString(PartitionerConfig.PARTITIONER_CLASS_CONFIG)); Map map = copyConfig(config); Partitioner partitioner = partitionerClasss.newInstance(); @@ -401,11 +515,20 @@ private Partitioner createPartitioner(HdfsSinkConnectorConfig config) private Map copyConfig(HdfsSinkConnectorConfig config) { Map map = new HashMap<>(); - map.put(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG, config.getString(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG)); - map.put(HdfsSinkConnectorConfig.PARTITION_DURATION_MS_CONFIG, config.getLong(HdfsSinkConnectorConfig.PARTITION_DURATION_MS_CONFIG)); - map.put(HdfsSinkConnectorConfig.PATH_FORMAT_CONFIG, config.getString(HdfsSinkConnectorConfig.PATH_FORMAT_CONFIG)); - map.put(HdfsSinkConnectorConfig.LOCALE_CONFIG, config.getString(HdfsSinkConnectorConfig.LOCALE_CONFIG)); - map.put(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, config.getString(HdfsSinkConnectorConfig.TIMEZONE_CONFIG)); + map.put( + PartitionerConfig.PARTITION_FIELD_NAME_CONFIG, + config.getString(PartitionerConfig.PARTITION_FIELD_NAME_CONFIG) + ); + map.put( + PartitionerConfig.PARTITION_DURATION_MS_CONFIG, + config.getLong(PartitionerConfig.PARTITION_DURATION_MS_CONFIG) + ); + map.put( + PartitionerConfig.PATH_FORMAT_CONFIG, + config.getString(PartitionerConfig.PATH_FORMAT_CONFIG) + ); + map.put(PartitionerConfig.LOCALE_CONFIG, config.getString(PartitionerConfig.LOCALE_CONFIG)); + map.put(PartitionerConfig.TIMEZONE_CONFIG, config.getString(PartitionerConfig.TIMEZONE_CONFIG)); return map; } } diff --git a/src/main/java/io/confluent/connect/hdfs/DateTimeUtils.java b/src/main/java/io/confluent/connect/hdfs/DateTimeUtils.java index 
9e338270f..42f32e61e 100644 --- a/src/main/java/io/confluent/connect/hdfs/DateTimeUtils.java +++ b/src/main/java/io/confluent/connect/hdfs/DateTimeUtils.java @@ -1,22 +1,4 @@ package io.confluent.connect.hdfs; -import org.joda.time.DateTimeZone; - -public class DateTimeUtils { - private static final long DAY_IN_MS = 24 * 60 * 60 * 1000; - - /** - * Calculates next period of periodMs after currentTimeMs starting from midnight in given timeZone. - * If the next period is in next day then 12am of next day will be returned - * @param currentTimeMs time to calculate at - * @param periodMs period in ms - * @param timeZone timezone to get midnight time - * @return timestamp in ms - */ - public static long getNextTimeAdjustedByDay(long currentTimeMs, long periodMs, DateTimeZone timeZone) { - long startOfDay = timeZone.convertLocalToUTC(timeZone.convertUTCToLocal(currentTimeMs) / DAY_IN_MS * DAY_IN_MS, true); - long nextPeriodOffset = ((currentTimeMs - startOfDay) / periodMs + 1) * periodMs; - long offset = Math.min(nextPeriodOffset, DAY_IN_MS); - return startOfDay + offset; - } -} +@Deprecated +public class DateTimeUtils extends io.confluent.connect.storage.util.DateTimeUtils {} diff --git a/src/main/java/io/confluent/connect/hdfs/FileUtils.java b/src/main/java/io/confluent/connect/hdfs/FileUtils.java index e5ae7a790..8706cbb92 100644 --- a/src/main/java/io/confluent/connect/hdfs/FileUtils.java +++ b/src/main/java/io/confluent/connect/hdfs/FileUtils.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.List; import java.util.UUID; import java.util.regex.Matcher; @@ -45,17 +46,17 @@ public static String directoryName(String url, String topicsDir, TopicPartition return url + "/" + topicsDir + "/" + topic + "/" + partition; } - public static String fileName(String url, String topicsDir, TopicPartition topicPart, - String name) { + public static String fileName( + String url, + String topicsDir, + TopicPartition topicPart, + String name + ) { String topic = topicPart.topic(); int partition = topicPart.partition(); return url + "/" + topicsDir + "/" + topic + "/" + partition + "/" + name; } - public static String hiveDirectoryName(String url, String topicsDir, String topic) { - return url + "/" + topicsDir + "/" + topic + "/"; - } - public static String fileName(String url, String topicsDir, String directory, String name) { return url + "/" + topicsDir + "/" + directory + "/" + name; } @@ -64,16 +65,27 @@ public static String directoryName(String url, String topicsDir, String director return url + "/" + topicsDir + "/" + directory; } - public static String tempFileName(String url, String topicsDir, String directory, - String extension) { + public static String tempFileName( + String url, + String topicsDir, + String directory, + String extension + ) { UUID id = UUID.randomUUID(); String name = id.toString() + "_" + "tmp" + extension; return fileName(url, topicsDir, directory, name); } - public static String committedFileName(String url, String topicsDir, String directory, - TopicPartition topicPart, long startOffset, long endOffset, - String extension, String zeroPadFormat) { + public static String committedFileName( + String url, + String topicsDir, + String directory, + TopicPartition topicPart, + long startOffset, + long endOffset, + String extension, + String zeroPadFormat + ) { String topic = topicPart.topic(); int partition = topicPart.partition(); StringBuilder sb = new StringBuilder(); @@ -93,13 +105,12 @@ public static String topicDirectory(String 
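[Editor's sketch] The getNextTimeAdjustedByDay implementation deleted a little above now lives in io.confluent.connect.storage.util.DateTimeUtils, with the deprecated HDFS class kept only as a subclass alias. Assuming the storage-common version keeps the same static signature, existing callers keep compiling; a small usage sketch for the scheduled-rotation case:

import org.joda.time.DateTimeZone;

import io.confluent.connect.hdfs.DateTimeUtils;

// Find the next hourly boundary after "now", measured from local midnight in the given
// timezone and capped at the following midnight.
public class NextRotationSketch {
  public static void main(String[] args) {
    DateTimeZone tz = DateTimeZone.forID("America/Los_Angeles");
    long periodMs = 60 * 60 * 1000L;  // e.g. rotate.schedule.interval.ms of one hour
    long next = DateTimeUtils.getNextTimeAdjustedByDay(System.currentTimeMillis(), periodMs, tz);
    System.out.println("next scheduled file commit at epoch ms " + next);
  }
}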
url, String topicsDir, String topic) return url + "/" + topicsDir + "/" + topic; } - private static ArrayList traverseImpl(Storage storage, Path path, PathFilter filter) - throws IOException { + private static ArrayList traverseImpl(Storage storage, Path path, PathFilter filter) { if (!storage.exists(path.toString())) { return new ArrayList<>(); } ArrayList result = new ArrayList<>(); - FileStatus[] statuses = storage.listStatus(path.toString()); + List statuses = storage.list(path.toString()); for (FileStatus status : statuses) { if (status.isDirectory()) { result.addAll(traverseImpl(storage, status.getPath(), filter)); @@ -118,14 +129,13 @@ public static FileStatus[] traverse(Storage storage, Path path, PathFilter filte return result.toArray(new FileStatus[result.size()]); } - public static FileStatus fileStatusWithMaxOffset(Storage storage, Path path, - CommittedFileFilter filter) throws IOException { + public static FileStatus fileStatusWithMaxOffset(Storage storage, Path path, CommittedFileFilter filter) { if (!storage.exists(path.toString())) { return null; } long maxOffset = -1L; FileStatus fileStatusWithMaxOffset = null; - FileStatus[] statuses = storage.listStatus(path.toString()); + List statuses = storage.list(path.toString()); for (FileStatus status : statuses) { if (status.isDirectory()) { FileStatus fileStatus = fileStatusWithMaxOffset(storage, status.getPath(), filter); @@ -160,14 +170,13 @@ public static long extractOffset(String filename) { return Long.parseLong(m.group(HdfsSinkConnectorConstants.PATTERN_END_OFFSET_GROUP)); } - private static ArrayList getDirectoriesImpl(Storage storage, Path path) - throws IOException { - FileStatus[] statuses = storage.listStatus(path.toString()); + private static ArrayList getDirectoriesImpl(Storage storage, Path path) { + List statuses = storage.list(path.toString()); ArrayList result = new ArrayList<>(); for (FileStatus status : statuses) { if (status.isDirectory()) { int count = 0; - FileStatus[] fileStatuses = storage.listStatus(status.getPath().toString()); + List fileStatuses = storage.list(status.getPath().toString()); for (FileStatus fileStatus : fileStatuses) { if (fileStatus.isDirectory()) { result.addAll(getDirectoriesImpl(storage, fileStatus.getPath())); @@ -175,7 +184,7 @@ private static ArrayList getDirectoriesImpl(Storage storage, Path pa count++; } } - if (count == fileStatuses.length) { + if (count == fileStatuses.size()) { result.add(status); } } diff --git a/src/main/java/io/confluent/connect/hdfs/Format.java b/src/main/java/io/confluent/connect/hdfs/Format.java index eb4c6029e..c59f93d2a 100644 --- a/src/main/java/io/confluent/connect/hdfs/Format.java +++ b/src/main/java/io/confluent/connect/hdfs/Format.java @@ -18,8 +18,10 @@ import io.confluent.connect.hdfs.hive.HiveMetaStore; import io.confluent.connect.hdfs.hive.HiveUtil; +// NOTE: DO NOT add or modify this class as it is maintained for compatibility +@Deprecated public interface Format { RecordWriterProvider getRecordWriterProvider(); SchemaFileReader getSchemaFileReader(AvroData avroData); - HiveUtil getHiveUtil(HdfsSinkConnectorConfig config, AvroData avroData, HiveMetaStore hiveMetaStore); + HiveUtil getHiveUtil(HdfsSinkConnectorConfig config, HiveMetaStore hiveMetaStore); } diff --git a/src/main/java/io/confluent/connect/hdfs/HdfsSinkConnectorConfig.java b/src/main/java/io/confluent/connect/hdfs/HdfsSinkConnectorConfig.java index fac3e01cb..9c0405100 100644 --- a/src/main/java/io/confluent/connect/hdfs/HdfsSinkConnectorConfig.java +++ 
b/src/main/java/io/confluent/connect/hdfs/HdfsSinkConnectorConfig.java @@ -14,6 +14,7 @@ package io.confluent.connect.hdfs; +import org.apache.hadoop.conf.Configuration; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; import org.apache.kafka.common.config.ConfigDef.Importance; @@ -22,25 +23,30 @@ import org.apache.kafka.common.config.ConfigException; import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Set; -import io.confluent.connect.hdfs.partitioner.DailyPartitioner; -import io.confluent.connect.hdfs.partitioner.DefaultPartitioner; -import io.confluent.connect.hdfs.partitioner.FieldPartitioner; -import io.confluent.connect.hdfs.partitioner.HourlyPartitioner; -import io.confluent.connect.hdfs.partitioner.Partitioner; -import io.confluent.connect.hdfs.partitioner.TimeBasedPartitioner; +import io.confluent.connect.storage.StorageSinkConnectorConfig; +import io.confluent.connect.storage.common.ComposableConfig; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.hive.HiveConfig; +import io.confluent.connect.storage.partitioner.PartitionerConfig; -public class HdfsSinkConnectorConfig extends AbstractConfig { +public class HdfsSinkConnectorConfig extends StorageSinkConnectorConfig { // HDFS Group + // This config is deprecated and will be removed in future releases. Use store.url instead. public static final String HDFS_URL_CONFIG = "hdfs.url"; - private static final String HDFS_URL_DOC = + public static final String HDFS_URL_DOC = "The HDFS connection URL. This configuration has the format of hdfs:://hostname:port and " - + "specifies the HDFS to export data to."; - private static final String HDFS_URL_DISPLAY = "HDFS URL"; + + "specifies the HDFS to export data to. This property is deprecated and will be removed in future releases. " + + "Use ``store.url`` instead."; + public static final String HDFS_URL_DEFAULT = null; + public static final String HDFS_URL_DISPLAY = "HDFS URL"; public static final String HADOOP_CONF_DIR_CONFIG = "hadoop.conf.dir"; private static final String HADOOP_CONF_DIR_DOC = @@ -54,53 +60,11 @@ public class HdfsSinkConnectorConfig extends AbstractConfig { public static final String HADOOP_HOME_DEFAULT = ""; private static final String HADOOP_HOME_DISPLAY = "Hadoop home directory"; - public static final String TOPICS_DIR_CONFIG = "topics.dir"; - private static final String TOPICS_DIR_DOC = - "Top level HDFS directory to store the data ingested from Kafka."; - public static final String TOPICS_DIR_DEFAULT = "topics"; - private static final String TOPICS_DIR_DISPLAY = "Topics directory"; - public static final String LOGS_DIR_CONFIG = "logs.dir"; - private static final String LOGS_DIR_DOC = - "Top level HDFS directory to store the write ahead logs."; + public static final String LOGS_DIR_DOC = + "Top level directory to store the write ahead logs."; public static final String LOGS_DIR_DEFAULT = "logs"; - private static final String LOGS_DIR_DISPLAY = "Logs directory"; - - public static final String FORMAT_CLASS_CONFIG = "format.class"; - private static final String FORMAT_CLASS_DOC = - "The format class to use when writing data to HDFS. 
"; - public static final String FORMAT_CLASS_DEFAULT = "io.confluent.connect.hdfs.avro.AvroFormat"; - private static final String FORMAT_CLASS_DISPLAY = "Format class"; - - // Hive group - public static final String HIVE_INTEGRATION_CONFIG = "hive.integration"; - private static final String HIVE_INTEGRATION_DOC = - "Configuration indicating whether to integrate with Hive when running the connector."; - public static final boolean HIVE_INTEGRATION_DEFAULT = false; - private static final String HIVE_INTEGRATION_DISPLAY = "Hive Integration"; - - public static final String HIVE_METASTORE_URIS_CONFIG = "hive.metastore.uris"; - private static final String HIVE_METASTORE_URIS_DOC = - "The Hive metastore URIs, can be IP address or fully-qualified domain name " - + "and port of the metastore host."; - public static final String HIVE_METASTORE_URIS_DEFAULT = ""; - private static final String HIVE_METASTORE_URIS_DISPLAY = "Hive Metastore URIs"; - - public static final String HIVE_CONF_DIR_CONFIG = "hive.conf.dir"; - private static final String HIVE_CONF_DIR_DOC = "Hive configuration directory"; - public static final String HIVE_CONF_DIR_DEFAULT = ""; - private static final String HIVE_CONF_DIR_DISPLAY = "Hive configuration directory."; - - public static final String HIVE_HOME_CONFIG = "hive.home"; - private static final String HIVE_HOME_DOC = "Hive home directory."; - public static final String HIVE_HOME_DEFAULT = ""; - private static final String HIVE_HOME_DISPLAY = "Hive home directory"; - - public static final String HIVE_DATABASE_CONFIG = "hive.database"; - private static final String HIVE_DATABASE_DOC = - "The database to use when the connector creates tables in Hive."; - private static final String HIVE_DATABASE_DEFAULT = "default"; - private static final String HIVE_DATABASE_DISPLAY = "Hive database"; + public static final String LOGS_DIR_DISPLAY = "Logs directory"; // Security group public static final String HDFS_AUTHENTICATION_KERBEROS_CONFIG = "hdfs.authentication.kerberos"; @@ -133,224 +97,221 @@ public class HdfsSinkConnectorConfig extends AbstractConfig { public static final long KERBEROS_TICKET_RENEW_PERIOD_MS_DEFAULT = 60000 * 60; private static final String KERBEROS_TICKET_RENEW_PERIOD_MS_DISPLAY = "Kerberos Ticket Renew Period (ms)"; - // Connector group - public static final String FLUSH_SIZE_CONFIG = "flush.size"; - private static final String FLUSH_SIZE_DOC = - "Number of records written to HDFS before invoking file commits."; - private static final String FLUSH_SIZE_DISPLAY = "Flush Size"; - - public static final String ROTATE_INTERVAL_MS_CONFIG = "rotate.interval.ms"; - private static final String ROTATE_INTERVAL_MS_DOC = - "The time interval in milliseconds to invoke file commits. This configuration ensures that " - + "file commits are invoked every configured interval. This configuration is useful when data " - + "ingestion rate is low and the connector didn't write enough messages to commit files." - + "The default value -1 means that this feature is disabled."; - private static final long ROTATE_INTERVAL_MS_DEFAULT = -1L; - private static final String ROTATE_INTERVAL_MS_DISPLAY = "Rotate Interval (ms)"; - - public static final String ROTATE_SCHEDULE_INTERVAL_MS_CONFIG = "rotate.schedule.interval.ms"; - private static final String ROTATE_SCHEDULE_INTERVAL_MS_DOC = - "The time interval in milliseconds to periodically invoke file commits. This configuration ensures that " - + "file commits are invoked every configured interval. 
Time of commit will be adjusted to 00:00 of selected timezone. " - + "Commit will be performed at scheduled time regardless previous commit time or number of messages. " - + "This configuration is useful when you have to commit your data based on current server time, like at the beginning of every hour. " - + "The default value -1 means that this feature is disabled."; - private static final long ROTATE_SCHEDULE_INTERVAL_MS_DEFAULT = -1L; - private static final String ROTATE_SCHEDULE_INTERVAL_MS_DISPLAY = "Rotate Schedule Interval (ms)"; - - public static final String RETRY_BACKOFF_CONFIG = "retry.backoff.ms"; - private static final String RETRY_BACKOFF_DOC = - "The retry backoff in milliseconds. This config is used to " - + "notify Kafka connect to retry delivering a message batch or performing recovery in case " - + "of transient exceptions."; - public static final long RETRY_BACKOFF_DEFAULT = 5000L; - private static final String RETRY_BACKOFF_DISPLAY = "Retry Backoff (ms)"; - - public static final String SHUTDOWN_TIMEOUT_CONFIG = "shutdown.timeout.ms"; - private static final String SHUTDOWN_TIMEOUT_DOC = - "Clean shutdown timeout. This makes sure that asynchronous Hive metastore updates are " - + "completed during connector shutdown."; - private static final long SHUTDOWN_TIMEOUT_DEFAULT = 3000L; - private static final String SHUTDOWN_TIMEOUT_DISPLAY = "Shutdown Timeout (ms)"; - - public static final String PARTITIONER_CLASS_CONFIG = "partitioner.class"; - private static final String PARTITIONER_CLASS_DOC = - "The partitioner to use when writing data to HDFS. You can use ``DefaultPartitioner``, " - + "which preserves the Kafka partitions; ``FieldPartitioner``, which partitions the data to " - + "different directories according to the value of the partitioning field specified " - + "in ``partition.field.name``; ``TimebasedPartitioner``, which partitions data " - + "according to the time ingested to HDFS."; - public static final String PARTITIONER_CLASS_DEFAULT = - "io.confluent.connect.hdfs.partitioner.DefaultPartitioner"; - private static final String PARTITIONER_CLASS_DISPLAY = "Partitioner Class"; - - public static final String PARTITION_FIELD_NAME_CONFIG = "partition.field.name"; - private static final String PARTITION_FIELD_NAME_DOC = - "The name of the partitioning field when FieldPartitioner is used."; - public static final String PARTITION_FIELD_NAME_DEFAULT = ""; - public static final String PARTITION_FIELD_NAME_DISPLAY = "Partition Field Name"; - - public static final String PARTITION_DURATION_MS_CONFIG = "partition.duration.ms"; - private static final String PARTITION_DURATION_MS_DOC = - "The duration of a partition milliseconds used by ``TimeBasedPartitioner``. " - + "The default value -1 means that we are not using ``TimebasedPartitioner``."; - public static final long PARTITION_DURATION_MS_DEFAULT = -1L; - private static final String PARTITION_DURATION_MS_DISPLAY = "Partition Duration (ms)"; - - public static final String PATH_FORMAT_CONFIG = "path.format"; - private static final String PATH_FORMAT_DOC = - "This configuration is used to set the format of the data directories when partitioning with " - + "``TimeBasedPartitioner``. The format set in this configuration converts the Unix timestamp " - + "to proper directories strings. 
For example, if you set " - + "``path.format='year'=YYYY/'month'=MM/'day'=dd/'hour'=HH/``, the data directories will have" - + " the format ``/year=2015/month=12/day=07/hour=15``."; - public static final String PATH_FORMAT_DEFAULT = ""; - private static final String PATH_FORMAT_DISPLAY = "Path Format"; - - public static final String LOCALE_CONFIG = "locale"; - private static final String LOCALE_DOC = - "The locale to use when partitioning with ``TimeBasedPartitioner``."; - public static final String LOCALE_DEFAULT = ""; - private static final String LOCALE_DISPLAY = "Locale"; - - public static final String TIMEZONE_CONFIG = "timezone"; - private static final String TIMEZONE_DOC = - "The timezone to use when partitioning with ``TimeBasedPartitioner``."; - public static final String TIMEZONE_DEFAULT = ""; - private static final String TIMEZONE_DISPLAY = "Timezone"; - - public static final String FILENAME_OFFSET_ZERO_PAD_WIDTH_CONFIG = "filename.offset.zero.pad.width"; - private static final String FILENAME_OFFSET_ZERO_PAD_WIDTH_DOC = - "Width to zero pad offsets in HDFS filenames to if the offsets is too short in order to " - + "provide fixed width filenames that can be ordered by simple lexicographic sorting."; - public static final int FILENAME_OFFSET_ZERO_PAD_WIDTH_DEFAULT = 10; - private static final String FILENAME_OFFSET_ZERO_PAD_WIDTH_DISPLAY = "Filename Offset Zero Pad Width"; - - // Schema group - public static final String SCHEMA_COMPATIBILITY_CONFIG = "schema.compatibility"; - private static final String SCHEMA_COMPATIBILITY_DOC = - "The schema compatibility rule to use when the connector is observing schema changes. The " - + "supported configurations are NONE, BACKWARD, FORWARD and FULL."; - private static final String SCHEMA_COMPATIBILITY_DEFAULT = "NONE"; - private static final String SCHEMA_COMPATIBILITY_DISPLAY = "Schema Compatibility"; - - public static final String SCHEMA_CACHE_SIZE_CONFIG = "schema.cache.size"; - private static final String SCHEMA_CACHE_SIZE_DOC = - "The size of the schema cache used in the Avro converter."; - public static final int SCHEMA_CACHE_SIZE_DEFAULT = 1000; - private static final String SCHEMA_CACHE_SIZE_DISPLAY = "Schema Cache Size"; - - // Internal group - public static final String STORAGE_CLASS_CONFIG = "storage.class"; - private static final String STORAGE_CLASS_DOC = - "The underlying storage layer. 
The default is HDFS."; + // Need to just set the default public static final String STORAGE_CLASS_DEFAULT = "io.confluent.connect.hdfs.storage.HdfsStorage"; - private static final String STORAGE_CLASS_DISPLAY = "Storage Class"; - - public static final String HDFS_GROUP = "HDFS"; - public static final String HIVE_GROUP = "Hive"; - public static final String SECURITY_GROUP = "Security"; - public static final String SCHEMA_GROUP = "Schema"; - public static final String CONNECTOR_GROUP = "Connector"; - public static final String INTERNAL_GROUP = "Internal"; - private static final ConfigDef.Recommender hiveIntegrationDependentsRecommender = new BooleanParentRecommender(HIVE_INTEGRATION_CONFIG); private static final ConfigDef.Recommender hdfsAuthenticationKerberosDependentsRecommender = new BooleanParentRecommender(HDFS_AUTHENTICATION_KERBEROS_CONFIG); - private static final ConfigDef.Recommender partitionerClassDependentsRecommender = new PartitionerClassDependentsRecommender(); - private static final ConfigDef.Recommender schemaCompatibilityRecommender = new SchemaCompatibilityRecommender(); - private static ConfigDef config = new ConfigDef(); + private final String name; + private Configuration hadoopConfig; - static { + private final StorageCommonConfig commonConfig; + private final HiveConfig hiveConfig; + private final PartitionerConfig partitionerConfig; + + private final Map propertyToConfig = new HashMap<>(); + private final Set allConfigs = new HashSet<>(); + static { // Define HDFS configuration group - config.define(HDFS_URL_CONFIG, Type.STRING, Importance.HIGH, HDFS_URL_DOC, HDFS_GROUP, 1, Width.MEDIUM, HDFS_URL_DISPLAY) - .define(HADOOP_CONF_DIR_CONFIG, Type.STRING, HADOOP_CONF_DIR_DEFAULT, Importance.HIGH, HADOOP_CONF_DIR_DOC, HDFS_GROUP, 2, Width.MEDIUM, HADOOP_CONF_DIR_DISPLAY) - .define(HADOOP_HOME_CONFIG, Type.STRING, HADOOP_HOME_DEFAULT, Importance.HIGH, HADOOP_HOME_DOC, HDFS_GROUP, 3, Width.SHORT, HADOOP_HOME_DISPLAY) - .define(TOPICS_DIR_CONFIG, Type.STRING, TOPICS_DIR_DEFAULT, Importance.HIGH, TOPICS_DIR_DOC, HDFS_GROUP, 4, Width.SHORT, TOPICS_DIR_DISPLAY) - .define(LOGS_DIR_CONFIG, Type.STRING, LOGS_DIR_DEFAULT, Importance.HIGH, LOGS_DIR_DOC, HDFS_GROUP, 5, Width.SHORT, LOGS_DIR_DISPLAY) - .define(FORMAT_CLASS_CONFIG, Type.STRING, FORMAT_CLASS_DEFAULT, Importance.HIGH, FORMAT_CLASS_DOC, HDFS_GROUP, 6, Width.SHORT, FORMAT_CLASS_DISPLAY); - - // Define Hive configuration group - config.define(HIVE_INTEGRATION_CONFIG, Type.BOOLEAN, HIVE_INTEGRATION_DEFAULT, Importance.HIGH, HIVE_INTEGRATION_DOC, HIVE_GROUP, 1, Width.SHORT, HIVE_INTEGRATION_DISPLAY, - Arrays.asList(HIVE_METASTORE_URIS_CONFIG, HIVE_CONF_DIR_CONFIG, HIVE_HOME_CONFIG, HIVE_DATABASE_CONFIG, SCHEMA_COMPATIBILITY_CONFIG)) - .define(HIVE_METASTORE_URIS_CONFIG, Type.STRING, HIVE_METASTORE_URIS_DEFAULT, Importance.HIGH, HIVE_METASTORE_URIS_DOC, HIVE_GROUP, 2, Width.MEDIUM, - HIVE_METASTORE_URIS_DISPLAY, hiveIntegrationDependentsRecommender) - .define(HIVE_CONF_DIR_CONFIG, Type.STRING, HIVE_CONF_DIR_DEFAULT, Importance.HIGH, HIVE_CONF_DIR_DOC, HIVE_GROUP, 3, Width.MEDIUM, HIVE_CONF_DIR_DISPLAY, hiveIntegrationDependentsRecommender) - .define(HIVE_HOME_CONFIG, Type.STRING, HIVE_HOME_DEFAULT, Importance.HIGH, HIVE_HOME_DOC, HIVE_GROUP, 4, Width.MEDIUM, HIVE_HOME_DISPLAY, hiveIntegrationDependentsRecommender) - .define(HIVE_DATABASE_CONFIG, Type.STRING, HIVE_DATABASE_DEFAULT, Importance.HIGH, HIVE_DATABASE_DOC, HIVE_GROUP, 5, Width.SHORT, HIVE_DATABASE_DISPLAY, hiveIntegrationDependentsRecommender); - - // Define Security 
configuration group - config.define(HDFS_AUTHENTICATION_KERBEROS_CONFIG, Type.BOOLEAN, HDFS_AUTHENTICATION_KERBEROS_DEFAULT, Importance.HIGH, HDFS_AUTHENTICATION_KERBEROS_DOC, - SECURITY_GROUP, 1, Width.SHORT, HDFS_AUTHENTICATION_KERBEROS_DISPLAY, - Arrays.asList(CONNECT_HDFS_PRINCIPAL_CONFIG, CONNECT_HDFS_KEYTAB_CONFIG, HDFS_NAMENODE_PRINCIPAL_CONFIG, KERBEROS_TICKET_RENEW_PERIOD_MS_CONFIG)) - .define(CONNECT_HDFS_PRINCIPAL_CONFIG, Type.STRING, CONNECT_HDFS_PRINCIPAL_DEFAULT, Importance.HIGH, CONNECT_HDFS_PRINCIPAL_DOC, - SECURITY_GROUP, 2, Width.MEDIUM, CONNECT_HDFS_PRINCIPAL_DISPLAY, hdfsAuthenticationKerberosDependentsRecommender) - .define(CONNECT_HDFS_KEYTAB_CONFIG, Type.STRING, CONNECT_HDFS_KEYTAB_DEFAULT, Importance.HIGH, CONNECT_HDFS_KEYTAB_DOC, - SECURITY_GROUP, 3, Width.MEDIUM, CONNECT_HDFS_KEYTAB_DISPLAY, hdfsAuthenticationKerberosDependentsRecommender) - .define(HDFS_NAMENODE_PRINCIPAL_CONFIG, Type.STRING, HDFS_NAMENODE_PRINCIPAL_DEFAULT, Importance.HIGH, HDFS_NAMENODE_PRINCIPAL_DOC, - SECURITY_GROUP, 4, Width.MEDIUM, HDFS_NAMENODE_PRINCIPAL_DISPLAY, hdfsAuthenticationKerberosDependentsRecommender) - .define(KERBEROS_TICKET_RENEW_PERIOD_MS_CONFIG, Type.LONG, KERBEROS_TICKET_RENEW_PERIOD_MS_DEFAULT, Importance.LOW, KERBEROS_TICKET_RENEW_PERIOD_MS_DOC, - SECURITY_GROUP, 5, Width.SHORT, KERBEROS_TICKET_RENEW_PERIOD_MS_DISPLAY, hdfsAuthenticationKerberosDependentsRecommender); - - // Define Schema configuration group - config.define(SCHEMA_COMPATIBILITY_CONFIG, Type.STRING, SCHEMA_COMPATIBILITY_DEFAULT, Importance.HIGH, SCHEMA_COMPATIBILITY_DOC, SCHEMA_GROUP, 1, Width.SHORT, - SCHEMA_COMPATIBILITY_DISPLAY, schemaCompatibilityRecommender) - .define(SCHEMA_CACHE_SIZE_CONFIG, Type.INT, SCHEMA_CACHE_SIZE_DEFAULT, Importance.LOW, SCHEMA_CACHE_SIZE_DOC, SCHEMA_GROUP, 2, Width.SHORT, SCHEMA_CACHE_SIZE_DISPLAY); - - // Define Connector configuration group - config.define(FLUSH_SIZE_CONFIG, Type.INT, Importance.HIGH, FLUSH_SIZE_DOC, CONNECTOR_GROUP, 1, Width.SHORT, FLUSH_SIZE_DISPLAY) - .define(ROTATE_INTERVAL_MS_CONFIG, Type.LONG, ROTATE_INTERVAL_MS_DEFAULT, Importance.HIGH, ROTATE_INTERVAL_MS_DOC, CONNECTOR_GROUP, 2, Width.SHORT, ROTATE_INTERVAL_MS_DISPLAY) - .define(ROTATE_SCHEDULE_INTERVAL_MS_CONFIG, Type.LONG, ROTATE_SCHEDULE_INTERVAL_MS_DEFAULT, Importance.MEDIUM, ROTATE_SCHEDULE_INTERVAL_MS_DOC, CONNECTOR_GROUP, 3, Width.SHORT, ROTATE_SCHEDULE_INTERVAL_MS_DISPLAY) - .define(RETRY_BACKOFF_CONFIG, Type.LONG, RETRY_BACKOFF_DEFAULT, Importance.LOW, RETRY_BACKOFF_DOC, CONNECTOR_GROUP, 4, Width.SHORT, RETRY_BACKOFF_DISPLAY) - .define(SHUTDOWN_TIMEOUT_CONFIG, Type.LONG, SHUTDOWN_TIMEOUT_DEFAULT, Importance.MEDIUM, SHUTDOWN_TIMEOUT_DOC, CONNECTOR_GROUP, 5, Width.SHORT, SHUTDOWN_TIMEOUT_DISPLAY) - .define(PARTITIONER_CLASS_CONFIG, Type.STRING, PARTITIONER_CLASS_DEFAULT, Importance.HIGH, PARTITIONER_CLASS_DOC, CONNECTOR_GROUP, 6, Width.LONG, PARTITIONER_CLASS_DISPLAY, - Arrays.asList(PARTITION_FIELD_NAME_CONFIG, PARTITION_DURATION_MS_CONFIG, PATH_FORMAT_CONFIG, LOCALE_CONFIG, TIMEZONE_CONFIG)) - .define(PARTITION_FIELD_NAME_CONFIG, Type.STRING, PARTITION_FIELD_NAME_DEFAULT, Importance.MEDIUM, PARTITION_FIELD_NAME_DOC, CONNECTOR_GROUP, 7, Width.MEDIUM, - PARTITION_FIELD_NAME_DISPLAY, partitionerClassDependentsRecommender) - .define(PARTITION_DURATION_MS_CONFIG, Type.LONG, PARTITION_DURATION_MS_DEFAULT, Importance.MEDIUM, PARTITION_DURATION_MS_DOC, CONNECTOR_GROUP, 8, Width.SHORT, - PARTITION_DURATION_MS_DISPLAY, partitionerClassDependentsRecommender) - .define(PATH_FORMAT_CONFIG, Type.STRING, 
PATH_FORMAT_DEFAULT, Importance.MEDIUM, PATH_FORMAT_DOC, CONNECTOR_GROUP, 9, Width.LONG, PATH_FORMAT_DISPLAY, - partitionerClassDependentsRecommender) - .define(LOCALE_CONFIG, Type.STRING, LOCALE_DEFAULT, Importance.MEDIUM, LOCALE_DOC, CONNECTOR_GROUP, 10, Width.MEDIUM, LOCALE_DISPLAY, partitionerClassDependentsRecommender) - .define(TIMEZONE_CONFIG, Type.STRING, TIMEZONE_DEFAULT, Importance.MEDIUM, TIMEZONE_DOC, CONNECTOR_GROUP, 11, Width.MEDIUM, TIMEZONE_DISPLAY, partitionerClassDependentsRecommender) - .define(FILENAME_OFFSET_ZERO_PAD_WIDTH_CONFIG, Type.INT, FILENAME_OFFSET_ZERO_PAD_WIDTH_DEFAULT, ConfigDef.Range.atLeast(0), Importance.LOW, FILENAME_OFFSET_ZERO_PAD_WIDTH_DOC, - CONNECTOR_GROUP, 12, Width.SHORT, FILENAME_OFFSET_ZERO_PAD_WIDTH_DISPLAY); - - // Define Internal configuration group - config.define(STORAGE_CLASS_CONFIG, Type.STRING, STORAGE_CLASS_DEFAULT, Importance.LOW, STORAGE_CLASS_DOC, INTERNAL_GROUP, 1, Width.MEDIUM, STORAGE_CLASS_DISPLAY); + { + final String group = "HDFS"; + int orderInGroup = 0; + + // HDFS_URL_CONFIG property is retained for backwards compatibility with HDFS connector and + // will be removed in future versions. + CONFIG_DEF.define( + HDFS_URL_CONFIG, + Type.STRING, + HDFS_URL_DEFAULT, + Importance.HIGH, + HDFS_URL_DOC, + group, + ++orderInGroup, + Width.MEDIUM, + HDFS_URL_DISPLAY + ); + + CONFIG_DEF.define( + HADOOP_CONF_DIR_CONFIG, + Type.STRING, + HADOOP_CONF_DIR_DEFAULT, + Importance.HIGH, + HADOOP_CONF_DIR_DOC, + group, + ++orderInGroup, + Width.MEDIUM, + HADOOP_CONF_DIR_DISPLAY + ); + + CONFIG_DEF.define( + HADOOP_HOME_CONFIG, + Type.STRING, + HADOOP_HOME_DEFAULT, + Importance.HIGH, + HADOOP_HOME_DOC, + group, + ++orderInGroup, + Width.SHORT, + HADOOP_HOME_DISPLAY + ); + + CONFIG_DEF.define( + LOGS_DIR_CONFIG, + Type.STRING, + LOGS_DIR_DEFAULT, + Importance.HIGH, + LOGS_DIR_DOC, + group, + ++orderInGroup, + Width.SHORT, + LOGS_DIR_DISPLAY + ); + } + + { + final String group = "Security"; + int orderInGroup = 0; + // Define Security configuration group + CONFIG_DEF.define( + HDFS_AUTHENTICATION_KERBEROS_CONFIG, + Type.BOOLEAN, + HDFS_AUTHENTICATION_KERBEROS_DEFAULT, + Importance.HIGH, + HDFS_AUTHENTICATION_KERBEROS_DOC, + group, + ++orderInGroup, + Width.SHORT, + HDFS_AUTHENTICATION_KERBEROS_DISPLAY, + Arrays.asList( + CONNECT_HDFS_PRINCIPAL_CONFIG, + CONNECT_HDFS_KEYTAB_CONFIG, + HDFS_NAMENODE_PRINCIPAL_CONFIG, + KERBEROS_TICKET_RENEW_PERIOD_MS_CONFIG + ) + ); + + CONFIG_DEF.define( + CONNECT_HDFS_PRINCIPAL_CONFIG, + Type.STRING, + CONNECT_HDFS_PRINCIPAL_DEFAULT, + Importance.HIGH, + CONNECT_HDFS_PRINCIPAL_DOC, + group, + ++orderInGroup, + Width.MEDIUM, + CONNECT_HDFS_PRINCIPAL_DISPLAY, + hdfsAuthenticationKerberosDependentsRecommender + ); + + CONFIG_DEF.define( + CONNECT_HDFS_KEYTAB_CONFIG, + Type.STRING, + CONNECT_HDFS_KEYTAB_DEFAULT, + Importance.HIGH, + CONNECT_HDFS_KEYTAB_DOC, + group, + ++orderInGroup, + Width.MEDIUM, + CONNECT_HDFS_KEYTAB_DISPLAY, + hdfsAuthenticationKerberosDependentsRecommender + ); + + CONFIG_DEF.define( + HDFS_NAMENODE_PRINCIPAL_CONFIG, + Type.STRING, + HDFS_NAMENODE_PRINCIPAL_DEFAULT, + Importance.HIGH, + HDFS_NAMENODE_PRINCIPAL_DOC, + group, + ++orderInGroup, + Width.MEDIUM, + HDFS_NAMENODE_PRINCIPAL_DISPLAY, + hdfsAuthenticationKerberosDependentsRecommender + ); + + CONFIG_DEF.define( + KERBEROS_TICKET_RENEW_PERIOD_MS_CONFIG, + Type.LONG, + KERBEROS_TICKET_RENEW_PERIOD_MS_DEFAULT, + Importance.LOW, + KERBEROS_TICKET_RENEW_PERIOD_MS_DOC, + group, + ++orderInGroup, + Width.SHORT, + 
KERBEROS_TICKET_RENEW_PERIOD_MS_DISPLAY, + hdfsAuthenticationKerberosDependentsRecommender + ); + } + + } + + public HdfsSinkConnectorConfig(Map props) { + this(CONFIG_DEF, props); + } + + protected HdfsSinkConnectorConfig(ConfigDef configDef, Map props) { + super(configDef, props); + commonConfig = new StorageCommonConfig(originalsStrings()); + hiveConfig = new HiveConfig(originalsStrings()); + partitionerConfig = new PartitionerConfig(originalsStrings()); + this.name = parseName(originalsStrings()); + this.hadoopConfig = new Configuration(); + addToGlobal(hiveConfig); + addToGlobal(partitionerConfig); + addToGlobal(commonConfig); + addToGlobal(this); + } + + private void addToGlobal(AbstractConfig config) { + allConfigs.add(config); + addConfig(config.values(), (ComposableConfig) config); } - private static class SchemaCompatibilityRecommender extends BooleanParentRecommender { - - public SchemaCompatibilityRecommender() { - super(HIVE_INTEGRATION_CONFIG); + private void addConfig(Map parsedProps, ComposableConfig config) { + for (String key : parsedProps.keySet()) { + propertyToConfig.put(key, config); } - - @Override - public List validValues(String name, Map connectorConfigs) { - boolean hiveIntegration = (Boolean) connectorConfigs.get(parentConfigName); - if (hiveIntegration) { - return Arrays.asList("BACKWARD", "FORWARD", "FULL"); - } else { - return Arrays.asList("NONE", "BACKWARD", "FORWARD", "FULL"); - } + } + + protected static String parseName(Map props) { + String nameProp = props.get("name"); + return nameProp != null ? nameProp : "S3-sink"; + } + + public String getName() { + return name; + } + + @Override + public Object get(String key) { + ComposableConfig config = propertyToConfig.get(key); + if (config == null) { + throw new ConfigException(String.format("Unknown configuration '%s'", key)); } + return config == this ? 
super.get(key) : config.get(key); + } - @Override - public boolean visible(String name, Map connectorConfigs) { - return true; + public Configuration getHadoopConfiguration() { + return hadoopConfig; + } + + public Map plainValues() { + Map map = new HashMap<>(); + for (AbstractConfig config : allConfigs) { + map.putAll(config.values()); } + return map; } - + private static class BooleanParentRecommender implements ConfigDef.Recommender { - + protected String parentConfigName; - + public BooleanParentRecommender(String parentConfigName) { this.parentConfigName = parentConfigName; } - + @Override public List validValues(String name, Map connectorConfigs) { return new LinkedList<>(); @@ -362,53 +323,22 @@ public boolean visible(String name, Map connectorConfigs) { } } - private static class PartitionerClassDependentsRecommender implements ConfigDef.Recommender { - - @Override - public List validValues(String name, Map props) { - return new LinkedList<>(); - } - - @Override - public boolean visible(String name, Map connectorConfigs) { - String partitionerName = (String) connectorConfigs.get(PARTITIONER_CLASS_CONFIG); - try { - @SuppressWarnings("unchecked") - Class partitioner = (Class) Class.forName(partitionerName); - if (classNameEquals(partitionerName, DefaultPartitioner.class)) { - return false; - } else if (FieldPartitioner.class.isAssignableFrom(partitioner)) { - // subclass of FieldPartitioner - return name.equals(PARTITION_FIELD_NAME_CONFIG); - } else if (TimeBasedPartitioner.class.isAssignableFrom(partitioner)) { - // subclass of TimeBasedPartitioner - if (classNameEquals(partitionerName, DailyPartitioner.class) || classNameEquals(partitionerName, HourlyPartitioner.class)) { - return name.equals(LOCALE_CONFIG) || name.equals(TIMEZONE_CONFIG); - } else { - return name.equals(PARTITION_DURATION_MS_CONFIG) || name.equals(PATH_FORMAT_CONFIG) || name.equals(LOCALE_CONFIG) || name.equals(TIMEZONE_CONFIG); - } - } else { - throw new ConfigException("Not a valid partitioner class: " + partitionerName); - } - } catch (ClassNotFoundException e) { - throw new ConfigException("Partitioner class not found: " + partitionerName); + public static ConfigDef getConfig() { + Map everything = new HashMap<>(CONFIG_DEF.configKeys()); + everything.putAll(StorageCommonConfig.getConfig().configKeys()); + everything.putAll(PartitionerConfig.getConfig().configKeys()); + + Set blacklist = new HashSet<>(); + blacklist.add(StorageSinkConnectorConfig.ROTATE_INTERVAL_MS_CONFIG); + blacklist.add(StorageSinkConnectorConfig.ROTATE_SCHEDULE_INTERVAL_MS_CONFIG); + blacklist.add(StorageSinkConnectorConfig.SHUTDOWN_TIMEOUT_CONFIG); + + ConfigDef visible = new ConfigDef(); + for (ConfigDef.ConfigKey key : everything.values()) { + if(!blacklist.contains(key.name)) { + visible.define(key); } } - } - - private static boolean classNameEquals(String className, Class clazz) { - return className.equals(clazz.getSimpleName()) || className.equals(clazz.getCanonicalName()); - } - - public static ConfigDef getConfig() { - return config; - } - - public HdfsSinkConnectorConfig(Map props) { - super(config, props); - } - - public static void main(String[] args) { - System.out.println(config.toEnrichedRst()); + return visible; } } diff --git a/src/main/java/io/confluent/connect/hdfs/HdfsSinkTask.java b/src/main/java/io/confluent/connect/hdfs/HdfsSinkTask.java index 66af31182..9c3e1c170 100644 --- a/src/main/java/io/confluent/connect/hdfs/HdfsSinkTask.java +++ b/src/main/java/io/confluent/connect/hdfs/HdfsSinkTask.java @@ -29,8 +29,9 @@ 
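[Editor's sketch] getConfig() above now merges the HDFS-specific definitions with the storage-common and partitioner definitions and hides a few rotation/shutdown keys. The main() method that used to print the RST docs was removed in the same hunk; an equivalent one-off tool can still be written against the merged definition, for example:

import io.confluent.connect.hdfs.HdfsSinkConnectorConfig;

// One-off doc generator: prints the merged, blacklist-filtered definition rather than the
// old standalone ConfigDef.
public class ConfigDocs {
  public static void main(String[] args) {
    System.out.println(HdfsSinkConnectorConfig.getConfig().toEnrichedRst());
  }
}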
import java.util.Set; import io.confluent.connect.avro.AvroData; -import io.confluent.connect.hdfs.schema.Compatibility; -import io.confluent.connect.hdfs.schema.SchemaUtils; +import io.confluent.connect.storage.hive.HiveConfig; +import io.confluent.connect.storage.partitioner.PartitionerConfig; +import io.confluent.connect.storage.schema.StorageSchemaCompatibility; public class HdfsSinkTask extends SinkTask { @@ -49,23 +50,23 @@ public String version() { @Override public void start(Map props) { - Set assignment = context.assignment();; + Set assignment = context.assignment(); try { HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props); - boolean hiveIntegration = connectorConfig.getBoolean(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG); + boolean hiveIntegration = connectorConfig.getBoolean(HiveConfig.HIVE_INTEGRATION_CONFIG); if (hiveIntegration) { - Compatibility compatibility = SchemaUtils.getCompatibility( - connectorConfig.getString(HdfsSinkConnectorConfig.SCHEMA_COMPATIBILITY_CONFIG)); - if (compatibility == Compatibility.NONE) { + StorageSchemaCompatibility compatibility = StorageSchemaCompatibility.getCompatibility( + connectorConfig.getString(HiveConfig.SCHEMA_COMPATIBILITY_CONFIG)); + if (compatibility == StorageSchemaCompatibility.NONE) { throw new ConfigException("Hive Integration requires schema compatibility to be BACKWARD, FORWARD or FULL"); } } //check that timezone it setup correctly in case of scheduled rotation if(connectorConfig.getLong(HdfsSinkConnectorConfig.ROTATE_SCHEDULE_INTERVAL_MS_CONFIG) > 0) { - String timeZoneString = connectorConfig.getString(HdfsSinkConnectorConfig.TIMEZONE_CONFIG); + String timeZoneString = connectorConfig.getString(PartitionerConfig.TIMEZONE_CONFIG); if (timeZoneString.equals("")) { - throw new ConfigException(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, + throw new ConfigException(PartitionerConfig.TIMEZONE_CONFIG, timeZoneString, "Timezone cannot be empty when using scheduled file rotation."); } DateTimeZone.forID(timeZoneString); @@ -84,7 +85,7 @@ public void start(Map props) { log.info("Couldn't start HdfsSinkConnector:", e); log.info("Shutting down HdfsSinkConnector."); if (hdfsWriter != null) { - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); } } @@ -118,7 +119,7 @@ public void open(Collection partitions) { @Override public void close(Collection partitions) { - hdfsWriter.close(partitions); + hdfsWriter.close(); } private void recover(Set assignment) { diff --git a/src/main/java/io/confluent/connect/hdfs/OldRecordWriterWrapper.java b/src/main/java/io/confluent/connect/hdfs/OldRecordWriterWrapper.java new file mode 100644 index 000000000..6c675f5e1 --- /dev/null +++ b/src/main/java/io/confluent/connect/hdfs/OldRecordWriterWrapper.java @@ -0,0 +1,45 @@ +package io.confluent.connect.hdfs; + +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.sink.SinkRecord; + +import java.io.IOException; + +/** + * Wrapper for old-style RecordWriters that implements the new common RecordWriter interface and + * delegates to the old implementation. + */ +public class OldRecordWriterWrapper implements io.confluent.connect.storage.format.RecordWriter { + + // Strictly speaking RecordWriter was generic, but in practice the implementation was always + // using the SinkRecord type despite the type not being specified everywhere. 
+ private final RecordWriter oldWriter; + + public OldRecordWriterWrapper(RecordWriter oldWriter) { + this.oldWriter = oldWriter; + } + + @Override + public void write(SinkRecord sinkRecord) { + try { + oldWriter.write(sinkRecord); + } catch (IOException e) { + throw new ConnectException("Failed to write a record to " + oldWriter, e); + } + } + + @Override + public void commit() { + // Old interface doesn't have commit + } + + + @Override + public void close() { + try { + oldWriter.close(); + } catch (IOException e) { + throw new ConnectException("Failed to close " + oldWriter, e); + } + } +} diff --git a/src/main/java/io/confluent/connect/hdfs/RecordWriter.java b/src/main/java/io/confluent/connect/hdfs/RecordWriter.java index 2249e7093..873358cf6 100644 --- a/src/main/java/io/confluent/connect/hdfs/RecordWriter.java +++ b/src/main/java/io/confluent/connect/hdfs/RecordWriter.java @@ -18,6 +18,8 @@ import java.io.IOException; +// NOTE: DO NOT add or modify this class as it is maintained for compatibility +@Deprecated public interface RecordWriter { void write(V value) throws IOException; void close() throws IOException; diff --git a/src/main/java/io/confluent/connect/hdfs/RecordWriterProvider.java b/src/main/java/io/confluent/connect/hdfs/RecordWriterProvider.java index 6507cab1a..bffd895b9 100644 --- a/src/main/java/io/confluent/connect/hdfs/RecordWriterProvider.java +++ b/src/main/java/io/confluent/connect/hdfs/RecordWriterProvider.java @@ -21,7 +21,14 @@ import io.confluent.connect.avro.AvroData; +// NOTE: DO NOT add or modify this class as it is maintained for compatibility +@Deprecated public interface RecordWriterProvider { String getExtension(); - RecordWriter getRecordWriter(Configuration conf, String fileName, SinkRecord record, AvroData avroData) throws IOException; + RecordWriter getRecordWriter( + Configuration conf, + String fileName, + SinkRecord record, + AvroData avroData + ) throws IOException; } diff --git a/src/main/java/io/confluent/connect/hdfs/SchemaFileReader.java b/src/main/java/io/confluent/connect/hdfs/SchemaFileReader.java index 11b3a1ab9..c2c3a5a0b 100644 --- a/src/main/java/io/confluent/connect/hdfs/SchemaFileReader.java +++ b/src/main/java/io/confluent/connect/hdfs/SchemaFileReader.java @@ -21,7 +21,11 @@ import java.io.IOException; import java.util.Collection; +// NOTE: DO NOT add or modify this class as it is maintained for compatibility +@Deprecated public interface SchemaFileReader { Schema getSchema(Configuration conf, Path path) throws IOException; + // NOTE: This method is no longer used and was only previously used in tests. It is safe to + // provide a dummy implementation. 
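[Editor's sketch] OldRecordWriterWrapper above is a plain adapter: anything implementing the now-deprecated io.confluent.connect.hdfs.RecordWriter (generic in the record type, SinkRecord in practice) can be handed to storage-common callers. A toy usage sketch — the "legacy" writer below only logs and does not touch HDFS:

import org.apache.kafka.connect.sink.SinkRecord;

import io.confluent.connect.hdfs.OldRecordWriterWrapper;
import io.confluent.connect.hdfs.RecordWriter;

public class WrapperSketch {
  public static void main(String[] args) throws Exception {
    RecordWriter<SinkRecord> legacy = new RecordWriter<SinkRecord>() {
      @Override
      public void write(SinkRecord record) {
        System.out.println("legacy write: " + record.value());
      }

      @Override
      public void close() {
        System.out.println("legacy close");
      }
    };

    // The wrapper implements io.confluent.connect.storage.format.RecordWriter.
    OldRecordWriterWrapper adapted = new OldRecordWriterWrapper(legacy);
    adapted.write(new SinkRecord("test_hdfs", 0, null, null, null, "a-value", 0L));
    adapted.commit();  // no-op: the legacy interface has no commit
    adapted.close();
  }
}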
Collection readData(Configuration conf, Path path) throws IOException; } diff --git a/src/main/java/io/confluent/connect/hdfs/TopicPartitionWriter.java b/src/main/java/io/confluent/connect/hdfs/TopicPartitionWriter.java index 2b319fcf1..145c5b2e6 100644 --- a/src/main/java/io/confluent/connect/hdfs/TopicPartitionWriter.java +++ b/src/main/java/io/confluent/connect/hdfs/TopicPartitionWriter.java @@ -14,13 +14,12 @@ package io.confluent.connect.hdfs; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.config.ConfigException; import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; import org.apache.kafka.connect.errors.SchemaProjectorException; import org.apache.kafka.connect.sink.SinkRecord; @@ -50,16 +49,18 @@ import io.confluent.connect.hdfs.hive.HiveMetaStore; import io.confluent.connect.hdfs.hive.HiveUtil; import io.confluent.connect.hdfs.partitioner.Partitioner; -import io.confluent.connect.hdfs.schema.Compatibility; -import io.confluent.connect.hdfs.schema.SchemaUtils; -import io.confluent.connect.hdfs.storage.Storage; -import io.confluent.connect.hdfs.wal.WAL; +import io.confluent.connect.hdfs.storage.HdfsStorage; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.hive.HiveConfig; +import io.confluent.connect.storage.partitioner.PartitionerConfig; +import io.confluent.connect.storage.schema.StorageSchemaCompatibility; +import io.confluent.connect.storage.wal.WAL; public class TopicPartitionWriter { private static final Logger log = LoggerFactory.getLogger(TopicPartitionWriter.class); private WAL wal; private Map tempFiles; - private Map> writers; + private Map writers; private TopicPartition tp; private Partitioner partitioner; private String url; @@ -67,7 +68,7 @@ public class TopicPartitionWriter { private State state; private Queue buffer; private boolean recovered; - private Storage storage; + private HdfsStorage storage; private SinkTaskContext context; private int recordCounter; private int flushSize; @@ -75,8 +76,15 @@ public class TopicPartitionWriter { private long lastRotate; private long rotateScheduleIntervalMs; private long nextScheduledRotate; + // This is one case where we cannot simply wrap the old or new RecordWriterProvider with the + // other because they have incompatible requirements for some methods -- one requires the Hadoop + // config + extra parameters, the other requires the ConnectorConfig and doesn't get the other + // extra parameters. Instead, we have to (optionally) store one of each and use whichever one is + // non-null. 
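+  // getWriter(SinkRecord, String) below prefers the old writerProvider when it is non-null and
+  // otherwise falls back to newWriterProvider.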
private RecordWriterProvider writerProvider; - private Configuration conf; + private final io.confluent.connect.storage.format.RecordWriterProvider + newWriterProvider; + private HdfsSinkConnectorConfig connectorConfig; private AvroData avroData; private Set appended; private long offset; @@ -84,9 +92,8 @@ public class TopicPartitionWriter { private Map offsets; private long timeoutMs; private long failureTime; - private Compatibility compatibility; + private StorageSchemaCompatibility compatibility; private Schema currentSchema; - private HdfsSinkConnectorConfig connectorConfig; private String extension; private final String zeroPadOffsetFormat; private DateTimeZone timeZone; @@ -94,7 +101,8 @@ public class TopicPartitionWriter { private final boolean hiveIntegration; private String hiveDatabase; private HiveMetaStore hiveMetaStore; - private SchemaFileReader schemaFileReader; + private io.confluent.connect.storage.format.SchemaFileReader + schemaFileReader; private HiveUtil hive; private ExecutorService executorService; private Queue> hiveUpdateFutures; @@ -102,46 +110,63 @@ public class TopicPartitionWriter { public TopicPartitionWriter( TopicPartition tp, - Storage storage, + HdfsStorage storage, RecordWriterProvider writerProvider, + io.confluent.connect.storage.format.RecordWriterProvider newWriterProvider, Partitioner partitioner, HdfsSinkConnectorConfig connectorConfig, SinkTaskContext context, AvroData avroData) { - this(tp, storage, writerProvider, partitioner, connectorConfig, context, avroData, null, null, null, null, null); + this( + tp, + storage, + writerProvider, + newWriterProvider, + partitioner, + connectorConfig, + context, + avroData, + null, + null, + null, + null, + null + ); } public TopicPartitionWriter( TopicPartition tp, - Storage storage, + HdfsStorage storage, RecordWriterProvider writerProvider, + io.confluent.connect.storage.format.RecordWriterProvider newWriterProvider, Partitioner partitioner, HdfsSinkConnectorConfig connectorConfig, SinkTaskContext context, AvroData avroData, HiveMetaStore hiveMetaStore, HiveUtil hive, - SchemaFileReader schemaFileReader, + io.confluent.connect.storage.format.SchemaFileReader + schemaFileReader, ExecutorService executorService, Queue> hiveUpdateFutures) { this.tp = tp; - this.connectorConfig = connectorConfig; this.context = context; this.avroData = avroData; this.storage = storage; this.writerProvider = writerProvider; + this.newWriterProvider = newWriterProvider; this.partitioner = partitioner; this.url = storage.url(); - this.conf = storage.conf(); + this.connectorConfig = storage.conf(); this.schemaFileReader = schemaFileReader; - topicsDir = connectorConfig.getString(HdfsSinkConnectorConfig.TOPICS_DIR_CONFIG); + topicsDir = connectorConfig.getString(StorageCommonConfig.TOPICS_DIR_CONFIG); flushSize = connectorConfig.getInt(HdfsSinkConnectorConfig.FLUSH_SIZE_CONFIG); rotateIntervalMs = connectorConfig.getLong(HdfsSinkConnectorConfig.ROTATE_INTERVAL_MS_CONFIG); rotateScheduleIntervalMs = connectorConfig.getLong(HdfsSinkConnectorConfig.ROTATE_SCHEDULE_INTERVAL_MS_CONFIG); timeoutMs = connectorConfig.getLong(HdfsSinkConnectorConfig.RETRY_BACKOFF_CONFIG); - compatibility = SchemaUtils.getCompatibility( - connectorConfig.getString(HdfsSinkConnectorConfig.SCHEMA_COMPATIBILITY_CONFIG)); + compatibility = StorageSchemaCompatibility.getCompatibility( + connectorConfig.getString(HiveConfig.SCHEMA_COMPATIBILITY_CONFIG)); String logsDir = connectorConfig.getString(HdfsSinkConnectorConfig.LOGS_DIR_CONFIG); wal = 
storage.wal(logsDir, tp); @@ -155,15 +180,22 @@ public TopicPartitionWriter( state = State.RECOVERY_STARTED; failureTime = -1L; offset = -1L; - extension = writerProvider.getExtension(); + if (writerProvider != null) { + extension = writerProvider.getExtension(); + } else if (newWriterProvider != null) { + extension = newWriterProvider.getExtension(); + } else { + throw new ConnectException("Invalid state: either old or new RecordWriterProvider must be" + + " provided"); + } zeroPadOffsetFormat = "%0" + connectorConfig.getInt(HdfsSinkConnectorConfig.FILENAME_OFFSET_ZERO_PAD_WIDTH_CONFIG) + "d"; - hiveIntegration = connectorConfig.getBoolean(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG); + hiveIntegration = connectorConfig.getBoolean(HiveConfig.HIVE_INTEGRATION_CONFIG); if (hiveIntegration) { - hiveDatabase = connectorConfig.getString(HdfsSinkConnectorConfig.HIVE_DATABASE_CONFIG); + hiveDatabase = connectorConfig.getString(HiveConfig.HIVE_DATABASE_CONFIG); this.hiveMetaStore = hiveMetaStore; this.hive = hive; this.executorService = executorService; @@ -172,7 +204,7 @@ public TopicPartitionWriter( } if(rotateScheduleIntervalMs > 0) { - timeZone = DateTimeZone.forID(connectorConfig.getString(HdfsSinkConnectorConfig.TIMEZONE_CONFIG)); + timeZone = DateTimeZone.forID(connectorConfig.getString(PartitionerConfig.TIMEZONE_CONFIG)); } // Initialize rotation timers @@ -265,18 +297,26 @@ public void write() { nextState(); case WRITE_PARTITION_PAUSED: if (currentSchema == null) { - if (compatibility != Compatibility.NONE && offset != -1) { + if (compatibility != StorageSchemaCompatibility.NONE && offset != -1) { String topicDir = FileUtils.topicDirectory(url, topicsDir, tp.topic()); CommittedFileFilter filter = new TopicPartitionCommittedFileFilter(tp); - FileStatus fileStatusWithMaxOffset = FileUtils.fileStatusWithMaxOffset(storage, new Path(topicDir), filter); + FileStatus fileStatusWithMaxOffset = FileUtils.fileStatusWithMaxOffset( + storage, + new Path(topicDir), + filter + ); if (fileStatusWithMaxOffset != null) { - currentSchema = schemaFileReader.getSchema(conf, fileStatusWithMaxOffset.getPath()); + currentSchema = schemaFileReader.getSchema( + connectorConfig, + fileStatusWithMaxOffset.getPath() + ); } } } SinkRecord record = buffer.peek(); Schema valueSchema = record.valueSchema(); - if (SchemaUtils.shouldChangeSchema(valueSchema, currentSchema, compatibility)) { + if ((recordCounter <= 0 && currentSchema == null && valueSchema != null) + || compatibility.shouldChangeSchema(record, null, currentSchema)) { currentSchema = valueSchema; if (hiveIntegration) { createHiveTable(); @@ -288,7 +328,7 @@ public void write() { break; } } else { - SinkRecord projectedRecord = SchemaUtils.project(record, currentSchema, compatibility); + SinkRecord projectedRecord = compatibility.project(record, null, currentSchema); writeRecord(projectedRecord); buffer.poll(); if (shouldRotate(now)) { @@ -318,7 +358,7 @@ public void write() { } } catch (SchemaProjectorException | IllegalWorkerStateException | HiveMetaStoreException e ) { throw new RuntimeException(e); - } catch (IOException | ConnectException e) { + } catch (ConnectException e) { log.error("Exception on topic partition {}: ", tp, e); failureTime = System.currentTimeMillis(); setRetryTimeout(timeoutMs); @@ -335,7 +375,7 @@ public void write() { closeTempFile(); appendToWAL(); commitFile(); - } catch (IOException e) { + } catch (DataException e) { log.error("Exception on topic partition {}: ", tp, e); failureTime = System.currentTimeMillis(); 
setRetryTimeout(timeoutMs); @@ -358,7 +398,7 @@ public void close() throws ConnectException { closeTempFile(encodedPartition); deleteTempFile(encodedPartition); } - } catch (IOException e) { + } catch (DataException e) { log.error("Error discarding temp file {} for {} {} when closing TopicPartitionWriter:", tempFiles.get(encodedPartition), tp, encodedPartition, e); } @@ -393,7 +433,7 @@ public long offset() { return offset; } - public Map> getWriters() { + Map getWriters() { return writers; } @@ -401,10 +441,6 @@ public Map getTempFiles() { return tempFiles; } - public String getExtension() { - return writerProvider.getExtension(); - } - private String getDirectory(String encodedPartition) { return partitioner.generatePartitionedPath(tp.topic(), encodedPartition); } @@ -426,15 +462,11 @@ private boolean shouldRotate(long now) { } private void readOffset() throws ConnectException { - try { - String path = FileUtils.topicDirectory(url, topicsDir, tp.topic()); - CommittedFileFilter filter = new TopicPartitionCommittedFileFilter(tp); - FileStatus fileStatusWithMaxOffset = FileUtils.fileStatusWithMaxOffset(storage, new Path(path), filter); - if (fileStatusWithMaxOffset != null) { - offset = FileUtils.extractOffset(fileStatusWithMaxOffset.getPath().getName()) + 1; - } - } catch (IOException e) { - throw new ConnectException(e); + String path = FileUtils.topicDirectory(url, topicsDir, tp.topic()); + CommittedFileFilter filter = new TopicPartitionCommittedFileFilter(tp); + FileStatus fileStatusWithMaxOffset = FileUtils.fileStatusWithMaxOffset(storage, new Path(path), filter); + if (fileStatusWithMaxOffset != null) { + offset = FileUtils.extractOffset(fileStatusWithMaxOffset.getPath().getName()) + 1; } } @@ -446,23 +478,42 @@ private void resume() { context.resume(tp); } - private RecordWriter getWriter(SinkRecord record, String encodedPartition) - throws ConnectException { + private io.confluent.connect.storage.format.RecordWriter getWriter( + SinkRecord record, + String encodedPartition + ) throws ConnectException { + if (writers.containsKey(encodedPartition)) { + return writers.get(encodedPartition); + } + String tempFile = getTempFile(encodedPartition); + + final io.confluent.connect.storage.format.RecordWriter writer; try { - if (writers.containsKey(encodedPartition)) { - return writers.get(encodedPartition); - } - String tempFile = getTempFile(encodedPartition); - RecordWriter writer = writerProvider.getRecordWriter(conf, tempFile, record, avroData); - writers.put(encodedPartition, writer); - if (hiveIntegration && !hivePartitions.contains(encodedPartition)) { - addHivePartition(encodedPartition); - hivePartitions.add(encodedPartition); + if (writerProvider != null) { + writer = new OldRecordWriterWrapper( + writerProvider.getRecordWriter( + connectorConfig.getHadoopConfiguration(), + tempFile, + record, + avroData + ) + ); + } else if (newWriterProvider != null) { + writer = newWriterProvider.getRecordWriter(connectorConfig, tempFile); + } else { + throw new ConnectException("Invalid state: either old or new RecordWriterProvider must be" + + " provided"); } - return writer; } catch (IOException e) { - throw new ConnectException(e); + throw new ConnectException("Couldn't create RecordWriter", e); } + + writers.put(encodedPartition, writer); + if (hiveIntegration && !hivePartitions.contains(encodedPartition)) { + addHivePartition(encodedPartition); + hivePartitions.add(encodedPartition); + } + return writer; } private String getTempFile(String encodedPartition) { @@ -505,39 +556,37 @@ private 
void resetOffsets() throws ConnectException { } } - private void writeRecord(SinkRecord record) throws IOException { + private void writeRecord(SinkRecord record) { if (offset == -1) { offset = record.kafkaOffset(); } String encodedPartition = partitioner.encodePartition(record); - RecordWriter writer = getWriter(record, encodedPartition); + io.confluent.connect.storage.format.RecordWriter writer = getWriter(record, encodedPartition); writer.write(record); if (!startOffsets.containsKey(encodedPartition)) { startOffsets.put(encodedPartition, record.kafkaOffset()); - offsets.put(encodedPartition, record.kafkaOffset()); - } else { - offsets.put(encodedPartition, record.kafkaOffset()); } + offsets.put(encodedPartition, record.kafkaOffset()); recordCounter++; } - private void closeTempFile(String encodedPartition) throws IOException { + private void closeTempFile(String encodedPartition) { if (writers.containsKey(encodedPartition)) { - RecordWriter writer = writers.get(encodedPartition); + io.confluent.connect.storage.format.RecordWriter writer = writers.get(encodedPartition); writer.close(); writers.remove(encodedPartition); } } - private void closeTempFile() throws IOException { + private void closeTempFile() { for (String encodedPartition: tempFiles.keySet()) { closeTempFile(encodedPartition); } } - private void appendToWAL(String encodedPartition) throws IOException { + private void appendToWAL(String encodedPartition) { String tempFile = tempFiles.get(encodedPartition); if (appended.contains(tempFile)) { return; @@ -555,7 +604,7 @@ private void appendToWAL(String encodedPartition) throws IOException { appended.add(tempFile); } - private void appendToWAL() throws IOException { + private void appendToWAL() { beginAppend(); for (String encodedPartition: tempFiles.keySet()) { appendToWAL(encodedPartition); @@ -563,50 +612,50 @@ private void appendToWAL() throws IOException { endAppend(); } - private void beginAppend() throws IOException { + private void beginAppend() { if (!appended.contains(WAL.beginMarker)) { wal.append(WAL.beginMarker, ""); } } - private void endAppend() throws IOException { + private void endAppend() { if (!appended.contains(WAL.endMarker)) { wal.append(WAL.endMarker, ""); } } - private void commitFile() throws IOException { + private void commitFile() { appended.clear(); for (String encodedPartition: tempFiles.keySet()) { commitFile(encodedPartition); } } - private void commitFile(String encodedPartiton) throws IOException { - if (!startOffsets.containsKey(encodedPartiton)) { + private void commitFile(String encodedPartition) { + if (!startOffsets.containsKey(encodedPartition)) { return; } - long startOffset = startOffsets.get(encodedPartiton); - long endOffset = offsets.get(encodedPartiton); - String tempFile = tempFiles.get(encodedPartiton); - String directory = getDirectory(encodedPartiton); + long startOffset = startOffsets.get(encodedPartition); + long endOffset = offsets.get(encodedPartition); + String tempFile = tempFiles.get(encodedPartition); + String directory = getDirectory(encodedPartition); String committedFile = FileUtils.committedFileName(url, topicsDir, directory, tp, startOffset, endOffset, extension, zeroPadOffsetFormat); String directoryName = FileUtils.directoryName(url, topicsDir, directory); if (!storage.exists(directoryName)) { - storage.mkdirs(directoryName); + storage.create(directoryName); } storage.commit(tempFile, committedFile); - startOffsets.remove(encodedPartiton); + startOffsets.remove(encodedPartition); offset = offset + recordCounter; 
recordCounter = 0; log.info("Committed {} for {}", committedFile, tp); } - private void deleteTempFile(String encodedPartiton) throws IOException { - storage.delete(tempFiles.get(encodedPartiton)); + private void deleteTempFile(String encodedPartition) { + storage.delete(tempFiles.get(encodedPartition)); } private void setRetryTimeout(long timeoutMs) { @@ -617,7 +666,11 @@ private void createHiveTable() { Future future = executorService.submit(new Callable() { @Override public Void call() throws HiveMetaStoreException { - hive.createTable(hiveDatabase, tp.topic(), currentSchema, partitioner); + try { + hive.createTable(hiveDatabase, tp.topic(), currentSchema, partitioner); + } catch (Throwable e) { + log.error("Creating Hive table threw unexpected error", e); + } return null; } }); @@ -628,7 +681,11 @@ private void alterHiveSchema() { Future future = executorService.submit(new Callable() { @Override public Void call() throws HiveMetaStoreException { - hive.alterSchema(hiveDatabase, tp.topic(), currentSchema); + try { + hive.alterSchema(hiveDatabase, tp.topic(), currentSchema); + } catch (Throwable e) { + log.error("Altering Hive schema threw unexpected error", e); + } return null; } }); @@ -639,7 +696,11 @@ private void addHivePartition(final String location) { Future future = executorService.submit(new Callable() { @Override public Void call() throws Exception { - hiveMetaStore.addPartition(hiveDatabase, tp.topic(), location); + try { + hiveMetaStore.addPartition(hiveDatabase, tp.topic(), location); + } catch (Throwable e) { + log.error("Adding Hive partition threw unexpected error", e); + } return null; } }); diff --git a/src/main/java/io/confluent/connect/hdfs/avro/AvroFileReader.java b/src/main/java/io/confluent/connect/hdfs/avro/AvroFileReader.java index 9773b1d7f..85954e026 100644 --- a/src/main/java/io/confluent/connect/hdfs/avro/AvroFileReader.java +++ b/src/main/java/io/confluent/connect/hdfs/avro/AvroFileReader.java @@ -23,16 +23,19 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.errors.DataException; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Iterator; import io.confluent.connect.avro.AvroData; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.SchemaFileReader; -public class AvroFileReader implements SchemaFileReader { - +public class AvroFileReader + implements io.confluent.connect.storage.format.SchemaFileReader { private AvroData avroData; public AvroFileReader(AvroData avroData) { @@ -40,25 +43,34 @@ public AvroFileReader(AvroData avroData) { } @Override - public Schema getSchema(Configuration conf, Path path) throws IOException { - SeekableInput input = new FsInput(path, conf); - DatumReader reader = new GenericDatumReader<>(); - FileReader fileReader = DataFileReader.openReader(input, reader); - org.apache.avro.Schema schema = fileReader.getSchema(); - fileReader.close(); - return avroData.toConnectSchema(schema); + public Schema getSchema(HdfsSinkConnectorConfig conf, Path path) { + try { + SeekableInput input = new FsInput(path, conf.getHadoopConfiguration()); + DatumReader reader = new GenericDatumReader<>(); + FileReader fileReader = DataFileReader.openReader(input, reader); + org.apache.avro.Schema schema = fileReader.getSchema(); + fileReader.close(); + return avroData.toConnectSchema(schema); + } catch (IOException e) { + throw new DataException(e); + } } 
- @Override - public Collection readData(Configuration conf, Path path) throws IOException { - ArrayList collection = new ArrayList<>(); - SeekableInput input = new FsInput(path, conf); - DatumReader reader = new GenericDatumReader<>(); - FileReader fileReader = DataFileReader.openReader(input, reader); - for (Object object: fileReader) { - collection.add(object); - } - fileReader.close(); - return collection; + public boolean hasNext() { + throw new UnsupportedOperationException(); + } + + public Object next() { + throw new UnsupportedOperationException(); } + + public void remove() { + throw new UnsupportedOperationException(); + } + + public Iterator iterator() { + throw new UnsupportedOperationException(); + } + + public void close() {} } diff --git a/src/main/java/io/confluent/connect/hdfs/avro/AvroFormat.java b/src/main/java/io/confluent/connect/hdfs/avro/AvroFormat.java index b5a047b8b..1be3ecd7a 100644 --- a/src/main/java/io/confluent/connect/hdfs/avro/AvroFormat.java +++ b/src/main/java/io/confluent/connect/hdfs/avro/AvroFormat.java @@ -10,29 +10,42 @@ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. - **/ + */ package io.confluent.connect.hdfs.avro; +import org.apache.hadoop.fs.Path; + import io.confluent.connect.avro.AvroData; -import io.confluent.connect.hdfs.Format; import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; -import io.confluent.connect.hdfs.RecordWriterProvider; -import io.confluent.connect.hdfs.SchemaFileReader; -import io.confluent.connect.hdfs.hive.HiveMetaStore; -import io.confluent.connect.hdfs.hive.HiveUtil; +import io.confluent.connect.hdfs.storage.HdfsStorage; +import io.confluent.connect.storage.format.RecordWriterProvider; +import io.confluent.connect.storage.format.SchemaFileReader; +import io.confluent.connect.storage.hive.HiveFactory; + +public class AvroFormat + implements io.confluent.connect.storage.format.Format { + private final AvroData avroData; -public class AvroFormat implements Format { + // DO NOT change this signature, it is required for instantiation via reflection + public AvroFormat(HdfsStorage storage) { + this.avroData = new AvroData( + storage.conf().getInt(HdfsSinkConnectorConfig.SCHEMA_CACHE_SIZE_CONFIG) + ); + } - public RecordWriterProvider getRecordWriterProvider() { - return new AvroRecordWriterProvider(); + @Override + public RecordWriterProvider getRecordWriterProvider() { + return new AvroRecordWriterProvider(avroData); } - public SchemaFileReader getSchemaFileReader(AvroData avroData) { + @Override + public SchemaFileReader getSchemaFileReader() { return new AvroFileReader(avroData); } - public HiveUtil getHiveUtil(HdfsSinkConnectorConfig config, AvroData avroData, HiveMetaStore hiveMetaStore) { - return new AvroHiveUtil(config, avroData, hiveMetaStore); + @Override + public HiveFactory getHiveFactory() { + return new AvroHiveFactory(avroData); } } diff --git a/src/main/java/io/confluent/connect/hdfs/avro/AvroHiveFactory.java b/src/main/java/io/confluent/connect/hdfs/avro/AvroHiveFactory.java new file mode 100644 index 000000000..7b24d8306 --- /dev/null +++ b/src/main/java/io/confluent/connect/hdfs/avro/AvroHiveFactory.java @@ -0,0 +1,46 @@ +/* + * Copyright 2016 Confluent Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file exceptin compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.confluent.connect.hdfs.avro; + +import org.apache.kafka.common.config.AbstractConfig; + +import io.confluent.connect.avro.AvroData; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; +import io.confluent.connect.hdfs.hive.HiveMetaStore; +import io.confluent.connect.hdfs.hive.HiveUtil; +import io.confluent.connect.storage.hive.HiveFactory; + +public class AvroHiveFactory implements HiveFactory { + private final AvroData avroData; + + public AvroHiveFactory(AvroData avroData) { + this.avroData = avroData; + } + + @Override + public io.confluent.connect.storage.hive.HiveUtil createHiveUtil( + AbstractConfig conf, + io.confluent.connect.storage.hive.HiveMetaStore hiveMetaStore + ) { + return createHiveUtil((HdfsSinkConnectorConfig) conf, (HiveMetaStore) hiveMetaStore); + } + + @Deprecated + public HiveUtil createHiveUtil(HdfsSinkConnectorConfig conf, HiveMetaStore hiveMetaStore) { + return new AvroHiveUtil(conf, avroData, hiveMetaStore); + } +} diff --git a/src/main/java/io/confluent/connect/hdfs/avro/AvroHiveUtil.java b/src/main/java/io/confluent/connect/hdfs/avro/AvroHiveUtil.java index 85d9de8fb..b11662457 100644 --- a/src/main/java/io/confluent/connect/hdfs/avro/AvroHiveUtil.java +++ b/src/main/java/io/confluent/connect/hdfs/avro/AvroHiveUtil.java @@ -10,7 +10,7 @@ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. 
- **/ + */ package io.confluent.connect.hdfs.avro; @@ -24,24 +24,28 @@ import java.util.List; import io.confluent.connect.avro.AvroData; -import io.confluent.connect.hdfs.FileUtils; -import io.confluent.connect.hdfs.errors.HiveMetaStoreException; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.hive.HiveMetaStore; -import io.confluent.connect.hdfs.hive.HiveSchemaConverter; import io.confluent.connect.hdfs.hive.HiveUtil; import io.confluent.connect.hdfs.partitioner.Partitioner; -import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.errors.HiveMetaStoreException; +import io.confluent.connect.storage.hive.HiveSchemaConverter; public class AvroHiveUtil extends HiveUtil { - private static final String avroSerde = "org.apache.hadoop.hive.serde2.avro.AvroSerDe"; - private static final String avroInputFormat = "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat"; - private static final String avroOutputFormat = "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"; + private static final String AVRO_SERDE = "org.apache.hadoop.hive.serde2.avro.AvroSerDe"; + private static final String AVRO_INPUT_FORMAT = "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat"; + private static final String AVRO_OUTPUT_FORMAT = "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"; private static final String AVRO_SCHEMA_LITERAL = "avro.schema.literal"; + private final AvroData avroData; + private final String topicsDir; - - public AvroHiveUtil(HdfsSinkConnectorConfig connectorConfig, AvroData avroData, HiveMetaStore hiveMetaStore) { - super(connectorConfig, avroData, hiveMetaStore); + public AvroHiveUtil(HdfsSinkConnectorConfig conf, AvroData avroData, HiveMetaStore + hiveMetaStore) { + super(conf, hiveMetaStore); + this.avroData = avroData; + this.topicsDir = conf.getString(StorageCommonConfig.TOPICS_DIR_CONFIG); } @Override @@ -63,12 +67,12 @@ private Table constructAvroTable(String database, String tableName, Schema schem Table table = newTable(database, tableName); table.setTableType(TableType.EXTERNAL_TABLE); table.getParameters().put("EXTERNAL", "TRUE"); - String tablePath = FileUtils.hiveDirectoryName(url, topicsDir, tableName); + String tablePath = hiveDirectoryName(url, topicsDir, tableName); table.setDataLocation(new Path(tablePath)); - table.setSerializationLib(avroSerde); + table.setSerializationLib(AVRO_SERDE); try { - table.setInputFormatClass(avroInputFormat); - table.setOutputFormatClass(avroOutputFormat); + table.setInputFormatClass(AVRO_INPUT_FORMAT); + table.setOutputFormatClass(AVRO_OUTPUT_FORMAT); } catch (HiveException e) { throw new HiveMetaStoreException("Cannot find input/output format:", e); } diff --git a/src/main/java/io/confluent/connect/hdfs/avro/AvroRecordWriterProvider.java b/src/main/java/io/confluent/connect/hdfs/avro/AvroRecordWriterProvider.java index bbf89b3e0..667aeb8f2 100644 --- a/src/main/java/io/confluent/connect/hdfs/avro/AvroRecordWriterProvider.java +++ b/src/main/java/io/confluent/connect/hdfs/avro/AvroRecordWriterProvider.java @@ -1,29 +1,26 @@ /** * Copyright 2015 Confluent Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ package io.confluent.connect.hdfs.avro; -import io.confluent.kafka.serializers.NonRecordContainer; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.io.DatumWriter; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.sink.SinkRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -31,13 +28,18 @@ import java.io.IOException; import io.confluent.connect.avro.AvroData; -import io.confluent.connect.hdfs.RecordWriter; -import io.confluent.connect.hdfs.RecordWriterProvider; - -public class AvroRecordWriterProvider implements RecordWriterProvider { +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; +import io.confluent.kafka.serializers.NonRecordContainer; +public class AvroRecordWriterProvider + implements io.confluent.connect.storage.format.RecordWriterProvider { private static final Logger log = LoggerFactory.getLogger(AvroRecordWriterProvider.class); private final static String EXTENSION = ".avro"; + private final AvroData avroData; + + AvroRecordWriterProvider(AvroData avroData) { + this.avroData = avroData; + } @Override public String getExtension() { @@ -45,34 +47,54 @@ public String getExtension() { } @Override - public RecordWriter getRecordWriter(Configuration conf, final String fileName, - SinkRecord record, final AvroData avroData) - throws IOException { - DatumWriter datumWriter = new GenericDatumWriter<>(); - final DataFileWriter writer = new DataFileWriter<>(datumWriter); - Path path = new Path(fileName); + public io.confluent.connect.storage.format.RecordWriter getRecordWriter( + final HdfsSinkConnectorConfig conf, + final String filename + ) { + return new io.confluent.connect.storage.format.RecordWriter() { + final DataFileWriter writer = new DataFileWriter<>(new GenericDatumWriter<>()); + final Path path = new Path(filename); + Schema schema = null; - final Schema schema = record.valueSchema(); - final FSDataOutputStream out = path.getFileSystem(conf).create(path); - org.apache.avro.Schema avroSchema = avroData.fromConnectSchema(schema); - writer.create(avroSchema, out); - - return new RecordWriter(){ @Override - public void write(SinkRecord record) throws IOException { + public void write(SinkRecord record) { + if (schema == null) { + schema = record.valueSchema(); + try { + log.info("Opening record writer for: {}", filename); + final FSDataOutputStream out = path.getFileSystem(conf.getHadoopConfiguration()) + .create(path); + org.apache.avro.Schema avroSchema = 
avroData.fromConnectSchema(schema); + writer.create(avroSchema, out); + } catch (IOException e) { + throw new ConnectException(e); + } + } + log.trace("Sink record: {}", record.toString()); Object value = avroData.fromConnectData(schema, record.value()); - // AvroData wraps primitive types so their schema can be included. We need to unwrap NonRecordContainers to just - // their value to properly handle these types - if (value instanceof NonRecordContainer) - writer.append(((NonRecordContainer) value).getValue()); - else - writer.append(value); + try { + // AvroData wraps primitive types so their schema can be included. We need to unwrap NonRecordContainers to just + // their value to properly handle these types + if (value instanceof NonRecordContainer) + writer.append(((NonRecordContainer) value).getValue()); + else + writer.append(value); + } catch (IOException e) { + throw new DataException(e); + } } @Override - public void close() throws IOException { - writer.close(); + public void commit() {} + + @Override + public void close() { + try { + writer.close(); + } catch (IOException e) { + throw new DataException(e); + } } }; } diff --git a/src/main/java/io/confluent/connect/hdfs/errors/HiveMetaStoreException.java b/src/main/java/io/confluent/connect/hdfs/errors/HiveMetaStoreException.java index 56ffb62c9..165810e38 100644 --- a/src/main/java/io/confluent/connect/hdfs/errors/HiveMetaStoreException.java +++ b/src/main/java/io/confluent/connect/hdfs/errors/HiveMetaStoreException.java @@ -14,7 +14,8 @@ package io.confluent.connect.hdfs.errors; -public class HiveMetaStoreException extends RuntimeException{ +@Deprecated +public class HiveMetaStoreException extends io.confluent.connect.storage.errors.HiveMetaStoreException { public HiveMetaStoreException(String s) { super(s); diff --git a/src/main/java/io/confluent/connect/hdfs/errors/PartitionException.java b/src/main/java/io/confluent/connect/hdfs/errors/PartitionException.java index 074881c19..e7142ed2f 100644 --- a/src/main/java/io/confluent/connect/hdfs/errors/PartitionException.java +++ b/src/main/java/io/confluent/connect/hdfs/errors/PartitionException.java @@ -14,7 +14,8 @@ package io.confluent.connect.hdfs.errors; -public class PartitionException extends RuntimeException { +@Deprecated +public class PartitionException extends io.confluent.connect.storage.errors.PartitionException { public PartitionException(String s) { super(s); diff --git a/src/main/java/io/confluent/connect/hdfs/filter/TopicPartitionCommittedFileFilter.java b/src/main/java/io/confluent/connect/hdfs/filter/TopicPartitionCommittedFileFilter.java index efa4aa449..3feffbadb 100644 --- a/src/main/java/io/confluent/connect/hdfs/filter/TopicPartitionCommittedFileFilter.java +++ b/src/main/java/io/confluent/connect/hdfs/filter/TopicPartitionCommittedFileFilter.java @@ -17,11 +17,10 @@ import org.apache.hadoop.fs.Path; import org.apache.kafka.common.TopicPartition; -import io.confluent.connect.hdfs.HdfsSinkConnector; -import io.confluent.connect.hdfs.HdfsSinkConnectorConstants; - import java.util.regex.Matcher; +import io.confluent.connect.hdfs.HdfsSinkConnectorConstants; + public class TopicPartitionCommittedFileFilter extends CommittedFileFilter { private TopicPartition tp; diff --git a/src/main/java/io/confluent/connect/hdfs/hive/HiveMetaStore.java b/src/main/java/io/confluent/connect/hdfs/hive/HiveMetaStore.java index dca869a23..fd1bbb987 100644 --- a/src/main/java/io/confluent/connect/hdfs/hive/HiveMetaStore.java +++ 
b/src/main/java/io/confluent/connect/hdfs/hive/HiveMetaStore.java @@ -16,346 +16,15 @@ package io.confluent.connect.hdfs.hive; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.IMetaStoreClient; -import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; -import org.apache.hadoop.hive.metastore.api.Database; -import org.apache.hadoop.hive.metastore.api.InvalidObjectException; -import org.apache.hadoop.hive.metastore.api.InvalidOperationException; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; -import org.apache.hadoop.hive.metastore.api.Partition; -import org.apache.hadoop.hive.metastore.api.UnknownDBException; -import org.apache.hadoop.hive.ql.metadata.Table; -import org.apache.hive.hcatalog.common.HCatUtil; -import org.apache.thrift.TException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.errors.HiveMetaStoreException; +import org.apache.hadoop.conf.Configuration; -public class HiveMetaStore { - - private static final Logger log = LoggerFactory.getLogger(HiveMetaStore.class); - private final IMetaStoreClient client; +@Deprecated +public class HiveMetaStore extends io.confluent.connect.storage.hive.HiveMetaStore { public HiveMetaStore(Configuration conf, HdfsSinkConnectorConfig connectorConfig) throws HiveMetaStoreException { - HiveConf hiveConf = new HiveConf(conf, HiveConf.class); - String hiveConfDir = connectorConfig.getString(HdfsSinkConnectorConfig.HIVE_CONF_DIR_CONFIG); - String hiveMetaStoreURIs = connectorConfig.getString(HdfsSinkConnectorConfig.HIVE_METASTORE_URIS_CONFIG); - if (hiveMetaStoreURIs.isEmpty()) { - log.warn("hive.metastore.uris empty, an embedded Hive metastore will be " - + "created in the directory the connector is started. " - + "You need to start Hive in that specific directory to query the data."); - } - if (!hiveConfDir.equals("")) { - String hiveSitePath = hiveConfDir + "/hive-site.xml"; - File hiveSite = new File(hiveSitePath); - if (!hiveSite.exists()) { - log.warn("hive-site.xml does not exist in provided Hive configuration directory {}.", hiveConf); - } - hiveConf.addResource(new Path(hiveSitePath)); - } - hiveConf.set("hive.metastore.uris", hiveMetaStoreURIs); - try { - client = HCatUtil.getHiveMetastoreClient(hiveConf); - } catch (IOException | MetaException e) { - throw new HiveMetaStoreException(e); - } - } - - private interface ClientAction { - R call() throws TException; + super(conf, connectorConfig); } - private R doAction(ClientAction action) throws TException { - // No need to implement retries here. We use RetryingMetaStoreClient - // which creates a proxy for a IMetaStoreClient implementation and - // retries calls to it on failure. The retrying client is conscious - // of the socket timeout and does not call reconnect on an open connection. - // Since HiveMetaStoreClient's reconnect method does not check the status - // of the connection, blind retries may cause a huge spike in the number - // of connections to the Hive MetaStore. 
- return action.call(); - } - - public void addPartition(final String database, final String tableName, final String path) throws HiveMetaStoreException { - ClientAction addPartition = new ClientAction() { - @Override - public Void call() throws TException { - // purposely don't check if the partition already exists because - // getPartition(db, table, path) will throw an exception to indicate the - // partition doesn't exist also. this way, it's only one call. - client.appendPartition(database, tableNameConverter(tableName), path); - return null; - } - }; - - try { - doAction(addPartition); - } catch (AlreadyExistsException e) { - // this is okay - } catch (InvalidObjectException e) { - throw new HiveMetaStoreException("Invalid partition for " + database + "." + tableNameConverter(tableName) + ": " + path, e); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public void dropPartition(final String database, final String tableName, final String path) throws HiveMetaStoreException { - ClientAction dropPartition = new ClientAction() { - @Override - public Void call() throws TException { - client.dropPartition(database, tableNameConverter(tableName), path, false); - return null; - } - }; - - try { - doAction(dropPartition); - } catch (NoSuchObjectException e) { - // this is okay - } catch (InvalidObjectException e) { - throw new HiveMetaStoreException("Invalid partition for " + database + "." + tableNameConverter(tableName) + ": " + path, e); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - - public void createDatabase(final String database) throws HiveMetaStoreException { - ClientAction create = new ClientAction() { - @Override - public Void call() throws TException { - client.createDatabase(new Database(database, "Database created by Kafka Connect", null, null)); - return null; - } - }; - - try { - doAction(create); - } catch (AlreadyExistsException e) { - log.warn("Hive database already exists: {}", database); - } catch (InvalidObjectException e) { - throw new HiveMetaStoreException("Invalid database: " + database, e); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - - public void dropDatabase(final String name, final boolean deleteData) throws HiveMetaStoreException { - ClientAction drop = new ClientAction() { - @Override - public Void call() throws TException { - client.dropDatabase(name, deleteData, true); - return null; - } - }; - - try { - doAction(drop); - } catch (NoSuchObjectException e) { - // this is okey - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public void createTable(final Table table) throws HiveMetaStoreException { - ClientAction create = new ClientAction() { - @Override - public Void call() throws TException { - client.createTable(table.getTTable()); - return null; - } - }; - - createDatabase(table.getDbName()); - - try { - doAction(create); - } catch (NoSuchObjectException 
e) { - throw new HiveMetaStoreException("Hive table not found: " + table.getDbName() + "." + tableNameConverter(table.getTableName())); - } catch (AlreadyExistsException e) { - // this is okey - log.warn("Hive table already exists: {}.{}", table.getDbName(), table.getTableName()); - } catch (InvalidObjectException e) { - throw new HiveMetaStoreException("Invalid table", e); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public void alterTable(final Table table) throws HiveMetaStoreException { - ClientAction alter = new ClientAction() { - @Override - public Void call() throws TException { - client.alter_table(table.getDbName(), tableNameConverter(table.getTableName()), table.getTTable()); - return null; - } - }; - - try { - doAction(alter); - } catch (NoSuchObjectException e) { - throw new HiveMetaStoreException("Hive table not found: " + table.getDbName() + "." + table.getTableName()); - } catch (InvalidObjectException e) { - throw new HiveMetaStoreException("Invalid table", e); - } catch (InvalidOperationException e) { - throw new HiveMetaStoreException("Invalid table change", e); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public void dropTable(final String database, final String tableName) { - ClientAction drop = new ClientAction() { - @Override - public Void call() throws TException { - client.dropTable(database, tableNameConverter(tableName), false, true); - return null; - } - }; - - try { - doAction(drop); - } catch (NoSuchObjectException e) { - // this is okay - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public boolean tableExists(final String database, final String tableName) throws HiveMetaStoreException { - ClientAction exists = new ClientAction() { - @Override - public Boolean call() throws TException { - return client.tableExists(database, tableNameConverter(tableName)); - } - }; - try { - return doAction(exists); - } catch (UnknownDBException e) { - return false; - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public Table getTable(final String database, final String tableName) throws HiveMetaStoreException { - ClientAction getTable = new ClientAction
() { - @Override - public Table call() throws TException { - return new Table(client.getTable(database, tableNameConverter(tableName))); - } - }; - - Table table; - try { - table = doAction(getTable); - } catch (NoSuchObjectException e) { - throw new HiveMetaStoreException("Hive table not found: " + database + "." + tableNameConverter(tableName)); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive table lookup exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - - if (table == null) { - throw new HiveMetaStoreException("Could not find info for table: " + tableNameConverter(tableName)); - } - return table; - } - - public List listPartitions(final String database, final String tableName, final short max) throws HiveMetaStoreException { - ClientAction> listPartitions = new ClientAction>() { - @Override - public List call() throws TException { - List partitions = client.listPartitions(database, tableNameConverter(tableName), max); - List paths = new ArrayList<>(); - for (Partition partition : partitions) { - paths.add(partition.getSd().getLocation()); - } - return paths; - } - }; - - try { - return doAction(listPartitions); - } catch (NoSuchObjectException e) { - return new ArrayList<>(); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public List getAllTables(final String database) throws HiveMetaStoreException { - ClientAction> getAllTables = new ClientAction>() { - @Override - public List call() throws TException { - return client.getAllTables(database); - } - }; - - try { - return doAction(getAllTables); - } catch (NoSuchObjectException e) { - return new ArrayList<>(); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public List getAllDatabases() throws HiveMetaStoreException { - ClientAction> create = - new ClientAction>() { - @Override - public List call() throws TException { - return client.getAllDatabases(); - } - }; - - try { - return doAction(create); - } catch (NoSuchObjectException e) { - return new ArrayList<>(); - } catch (MetaException e) { - throw new HiveMetaStoreException("Hive MetaStore exception", e); - } catch (TException e) { - throw new HiveMetaStoreException("Exception communicating with the Hive MetaStore", e); - } - } - - public String tableNameConverter(String table){ - return table == null ? table : table.replaceAll("\\.", "_"); - } } diff --git a/src/main/java/io/confluent/connect/hdfs/hive/HiveSchemaConverter.java b/src/main/java/io/confluent/connect/hdfs/hive/HiveSchemaConverter.java deleted file mode 100644 index 3366dee48..000000000 --- a/src/main/java/io/confluent/connect/hdfs/hive/HiveSchemaConverter.java +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Copyright 2015 Confluent Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except - * in compliance with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License - * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express - * or implied. See the License for the specific language governing permissions and limitations under - * the License. - **/ - -package io.confluent.connect.hdfs.hive; - -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.kafka.connect.data.Field; -import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.Schema.Type; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class HiveSchemaConverter { - - private static final Map TYPE_TO_TYPEINFO; - - static { - TYPE_TO_TYPEINFO = new HashMap<>(); - TYPE_TO_TYPEINFO.put(Type.BOOLEAN, TypeInfoFactory.booleanTypeInfo); - TYPE_TO_TYPEINFO.put(Type.INT8, TypeInfoFactory.byteTypeInfo); - TYPE_TO_TYPEINFO.put(Type.INT16, TypeInfoFactory.shortTypeInfo); - TYPE_TO_TYPEINFO.put(Type.INT32, TypeInfoFactory.intTypeInfo); - TYPE_TO_TYPEINFO.put(Type.INT64, TypeInfoFactory.longTypeInfo); - TYPE_TO_TYPEINFO.put(Type.FLOAT32, TypeInfoFactory.floatTypeInfo); - TYPE_TO_TYPEINFO.put(Type.FLOAT64, TypeInfoFactory.doubleTypeInfo); - TYPE_TO_TYPEINFO.put(Type.BYTES, TypeInfoFactory.binaryTypeInfo); - TYPE_TO_TYPEINFO.put(Type.STRING, TypeInfoFactory.stringTypeInfo); - } - - public static List convertSchema(Schema schema) { - List columns = new ArrayList<>(); - if (Schema.Type.STRUCT.equals(schema.type())) { - for (Field field: schema.fields()) { - columns.add(new FieldSchema( - field.name(), convert(field.schema()).getTypeName(), field.schema().doc())); - } - } - return columns; - } - - public static TypeInfo convert(Schema schema) { - // TODO: throw an error on recursive types - switch (schema.type()) { - case STRUCT: - return convertStruct(schema); - case ARRAY: - return convertArray(schema); - case MAP: - return convertMap(schema); - default: - return convertPrimitive(schema); - } - } - - public static TypeInfo convertStruct(Schema schema) { - final List fields = schema.fields(); - final List names = new ArrayList<>(fields.size()); - final List types = new ArrayList<>(fields.size()); - for (Field field : fields) { - names.add(field.name()); - types.add(convert(field.schema())); - } - return TypeInfoFactory.getStructTypeInfo(names, types); - } - - public static TypeInfo convertArray(Schema schema) { - return TypeInfoFactory.getListTypeInfo(convert(schema.valueSchema())); - } - - public static TypeInfo convertMap(Schema schema) { - return TypeInfoFactory.getMapTypeInfo( - convert(schema.keySchema()), convert(schema.valueSchema())); - } - - public static TypeInfo convertPrimitive(Schema schema) { - return TYPE_TO_TYPEINFO.get(schema.type()); - } -} diff --git a/src/main/java/io/confluent/connect/hdfs/hive/HiveUtil.java b/src/main/java/io/confluent/connect/hdfs/hive/HiveUtil.java index 7de26e6d1..5d4e7a32c 100644 --- a/src/main/java/io/confluent/connect/hdfs/hive/HiveUtil.java +++ b/src/main/java/io/confluent/connect/hdfs/hive/HiveUtil.java @@ -14,33 +14,45 @@ package io.confluent.connect.hdfs.hive; +import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.ql.metadata.Table; import 
org.apache.kafka.connect.data.Schema; -import io.confluent.connect.avro.AvroData; import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.partitioner.Partitioner; +import io.confluent.connect.storage.common.StorageCommonConfig; -public abstract class HiveUtil { +// NOTE: DO NOT add or modify this class as it is maintained for compatibility +@Deprecated +public abstract class HiveUtil extends io.confluent.connect.storage.hive.HiveUtil { - protected final String url; - protected final String topicsDir; - protected final AvroData avroData; - protected final HiveMetaStore hiveMetaStore; + public HiveUtil(HdfsSinkConnectorConfig connectorConfig, HiveMetaStore hiveMetaStore) { + super(connectorConfig, hiveMetaStore); + String urlKey; + urlKey = connectorConfig.getString(StorageCommonConfig.STORE_URL_CONFIG); + if (urlKey == null || urlKey.equals(StorageCommonConfig.STORE_URL_DEFAULT)) { + urlKey = connectorConfig.getString(HdfsSinkConnectorConfig.HDFS_URL_CONFIG); + } - public HiveUtil(HdfsSinkConnectorConfig connectorConfig, AvroData avroData, HiveMetaStore hiveMetaStore) { - this.url = connectorConfig.getString(HdfsSinkConnectorConfig.HDFS_URL_CONFIG); - this.topicsDir = connectorConfig.getString(HdfsSinkConnectorConfig.TOPICS_DIR_CONFIG); - this.avroData = avroData; - this.hiveMetaStore = hiveMetaStore; + this.url = urlKey; } - public abstract void createTable(String database, String tableName, Schema schema, Partitioner partitioner); - - public abstract void alterSchema(String database, String tableName, Schema schema); - - public Table newTable(String database, String table){ - return new Table(database, hiveMetaStore.tableNameConverter(table)); + @Override + public void createTable( + String database, + String tableName, + Schema schema, + io.confluent.connect.storage.partitioner.Partitioner partitioner + ) { + createTable(database, tableName, schema, (Partitioner) partitioner); } + + public abstract void createTable( + String database, + String tableName, + Schema schema, + Partitioner partitioner + ); + } diff --git a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetFileReader.java b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetFileReader.java index d08e30171..0e68ad364 100644 --- a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetFileReader.java +++ b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetFileReader.java @@ -15,21 +15,20 @@ package io.confluent.connect.hdfs.parquet; import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.errors.DataException; import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.hadoop.ParquetReader; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; +import java.util.Iterator; import io.confluent.connect.avro.AvroData; -import io.confluent.connect.hdfs.SchemaFileReader; - -public class ParquetFileReader implements SchemaFileReader { +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; +public class ParquetFileReader + implements io.confluent.connect.storage.format.SchemaFileReader { private AvroData avroData; public ParquetFileReader(AvroData avroData) { @@ -37,30 +36,39 @@ public ParquetFileReader(AvroData avroData) { } @Override - public Schema getSchema(Configuration conf, Path path) throws IOException { + public Schema getSchema(HdfsSinkConnectorConfig conf, Path path) { AvroReadSupport 
readSupport = new AvroReadSupport<>(); ParquetReader.Builder builder = ParquetReader.builder(readSupport, path); - ParquetReader parquetReader = builder.withConf(conf).build(); - GenericRecord record; - Schema schema = null; - while ((record = parquetReader.read()) != null) { - schema = avroData.toConnectSchema(record.getSchema()); + try { + ParquetReader parquetReader = builder.withConf(conf.getHadoopConfiguration()) + .build(); + GenericRecord record; + Schema schema = null; + while ((record = parquetReader.read()) != null) { + schema = avroData.toConnectSchema(record.getSchema()); + } + parquetReader.close(); + return schema; + } catch (IOException e) { + throw new DataException(e); } - parquetReader.close(); - return schema; } - @Override - public Collection readData(Configuration conf, Path path) throws IOException { - Collection result = new ArrayList<>(); - AvroReadSupport readSupport = new AvroReadSupport<>(); - ParquetReader.Builder builder = ParquetReader.builder(readSupport, path); - ParquetReader parquetReader = builder.withConf(conf).build(); - GenericRecord record; - while ((record = parquetReader.read()) != null) { - result.add(record); - } - parquetReader.close(); - return result; + public boolean hasNext() { + throw new UnsupportedOperationException(); + } + + public Object next() { + throw new UnsupportedOperationException(); + } + + public void remove() { + throw new UnsupportedOperationException(); } + + public Iterator iterator() { + throw new UnsupportedOperationException(); + } + + public void close() {} } diff --git a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetFormat.java b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetFormat.java index 2d930e5aa..dd597f320 100644 --- a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetFormat.java +++ b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetFormat.java @@ -10,28 +10,42 @@ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. 
- **/ + */ package io.confluent.connect.hdfs.parquet; +import org.apache.hadoop.fs.Path; + import io.confluent.connect.avro.AvroData; -import io.confluent.connect.hdfs.Format; import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; -import io.confluent.connect.hdfs.RecordWriterProvider; -import io.confluent.connect.hdfs.SchemaFileReader; -import io.confluent.connect.hdfs.hive.HiveMetaStore; -import io.confluent.connect.hdfs.hive.HiveUtil; - -public class ParquetFormat implements Format { - public RecordWriterProvider getRecordWriterProvider() { - return new ParquetRecordWriterProvider(); +import io.confluent.connect.hdfs.storage.HdfsStorage; +import io.confluent.connect.storage.format.RecordWriterProvider; +import io.confluent.connect.storage.format.SchemaFileReader; +import io.confluent.connect.storage.hive.HiveFactory; + +public class ParquetFormat + implements io.confluent.connect.storage.format.Format { + private final AvroData avroData; + + // DO NOT change this signature, it is required for instantiation via reflection + public ParquetFormat(HdfsStorage storage) { + this.avroData = new AvroData( + storage.conf().getInt(HdfsSinkConnectorConfig.SCHEMA_CACHE_SIZE_CONFIG) + ); + } + + @Override + public RecordWriterProvider getRecordWriterProvider() { + return new ParquetRecordWriterProvider(avroData); } - public SchemaFileReader getSchemaFileReader(AvroData avroData) { + @Override + public SchemaFileReader getSchemaFileReader() { return new ParquetFileReader(avroData); } - public HiveUtil getHiveUtil(HdfsSinkConnectorConfig config, AvroData avroData, HiveMetaStore hiveMetaStore) { - return new ParquetHiveUtil(config, avroData, hiveMetaStore); + @Override + public HiveFactory getHiveFactory() { + return new ParquetHiveFactory(); } } diff --git a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetHiveFactory.java b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetHiveFactory.java new file mode 100644 index 000000000..6f9a01385 --- /dev/null +++ b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetHiveFactory.java @@ -0,0 +1,40 @@ +/* + * Copyright 2016 Confluent Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package io.confluent.connect.hdfs.parquet; + +import org.apache.kafka.common.config.AbstractConfig; + +import io.confluent.connect.avro.AvroData; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; +import io.confluent.connect.storage.hive.HiveFactory; +import io.confluent.connect.hdfs.hive.HiveMetaStore; +import io.confluent.connect.storage.hive.HiveUtil; + +public class ParquetHiveFactory implements HiveFactory { + @Override + public HiveUtil createHiveUtil( + AbstractConfig config, + io.confluent.connect.storage.hive.HiveMetaStore hiveMetaStore + ) { + return createHiveUtil((HdfsSinkConnectorConfig) config, (HiveMetaStore) hiveMetaStore); + } + + @Deprecated + public HiveUtil createHiveUtil(HdfsSinkConnectorConfig config, HiveMetaStore hiveMetaStore) { + return new ParquetHiveUtil(config, hiveMetaStore); + } +} diff --git a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetHiveUtil.java b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetHiveUtil.java index 9fa6aaa53..fbf74b23d 100644 --- a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetHiveUtil.java +++ b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetHiveUtil.java @@ -1,18 +1,16 @@ /** * Copyright 2015 Confluent Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ package io.confluent.connect.hdfs.parquet; @@ -25,23 +23,29 @@ import java.util.List; -import io.confluent.connect.avro.AvroData; -import io.confluent.connect.hdfs.FileUtils; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.hive.HiveMetaStore; -import io.confluent.connect.hdfs.hive.HiveSchemaConverter; import io.confluent.connect.hdfs.hive.HiveUtil; import io.confluent.connect.hdfs.partitioner.Partitioner; -import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; -import io.confluent.connect.hdfs.errors.HiveMetaStoreException; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.errors.HiveMetaStoreException; +import io.confluent.connect.storage.hive.HiveSchemaConverter; public class ParquetHiveUtil extends HiveUtil { + private final String topicsDir; - public ParquetHiveUtil(HdfsSinkConnectorConfig connectorConfig, AvroData avroData, HiveMetaStore hiveMetaStore) { - super(connectorConfig, avroData, hiveMetaStore); + public ParquetHiveUtil(HdfsSinkConnectorConfig conf, HiveMetaStore hiveMetaStore) { + super(conf, hiveMetaStore); + this.topicsDir = conf.getString(StorageCommonConfig.TOPICS_DIR_CONFIG); } @Override - public void createTable(String database, String tableName, Schema schema, Partitioner partitioner) throws HiveMetaStoreException { + public void createTable( + String database, + String tableName, + Schema schema, + Partitioner partitioner + ) throws HiveMetaStoreException { Table table = constructParquetTable(database, tableName, schema, partitioner); hiveMetaStore.createTable(table); } @@ -54,11 +58,16 @@ public void alterSchema(String database, String tableName, Schema schema) { hiveMetaStore.alterTable(table); } - private Table constructParquetTable(String database, String tableName, Schema schema, Partitioner partitioner) throws HiveMetaStoreException { + private Table constructParquetTable( + String database, + String tableName, + Schema schema, + Partitioner partitioner + ) throws HiveMetaStoreException { Table table = newTable(database, tableName); table.setTableType(TableType.EXTERNAL_TABLE); table.getParameters().put("EXTERNAL", "TRUE"); - String tablePath = FileUtils.hiveDirectoryName(url, topicsDir, tableName); + String tablePath = hiveDirectoryName(url, topicsDir, tableName); table.setDataLocation(new Path(tablePath)); table.setSerializationLib(getHiveParquetSerde()); try { diff --git a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetRecordWriterProvider.java b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetRecordWriterProvider.java index eb4d1c396..77e2741cb 100644 --- a/src/main/java/io/confluent/connect/hdfs/parquet/ParquetRecordWriterProvider.java +++ b/src/main/java/io/confluent/connect/hdfs/parquet/ParquetRecordWriterProvider.java @@ -10,27 +10,37 @@ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. 
- **/ + */ + package io.confluent.connect.hdfs.parquet; -import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.sink.SinkRecord; import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import io.confluent.connect.avro.AvroData; -import io.confluent.connect.hdfs.RecordWriterProvider; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.RecordWriter; +import io.confluent.connect.hdfs.RecordWriterProvider; -public class ParquetRecordWriterProvider implements RecordWriterProvider { - +public class ParquetRecordWriterProvider + implements io.confluent.connect.storage.format.RecordWriterProvider { + private static final Logger log = LoggerFactory.getLogger(ParquetRecordWriterProvider.class); private final static String EXTENSION = ".parquet"; + private final AvroData avroData; + + ParquetRecordWriterProvider(AvroData avroData) { + this.avroData = avroData; + } @Override public String getExtension() { @@ -38,29 +48,60 @@ public String getExtension() { } @Override - public RecordWriter getRecordWriter( - Configuration conf, final String fileName, SinkRecord record, final AvroData avroData) - throws IOException { - final Schema avroSchema = avroData.fromConnectSchema(record.valueSchema()); - CompressionCodecName compressionCodecName = CompressionCodecName.SNAPPY; + public io.confluent.connect.storage.format.RecordWriter getRecordWriter( + final HdfsSinkConnectorConfig conf, + final String filename + ) { + return new io.confluent.connect.storage.format.RecordWriter() { + final CompressionCodecName compressionCodecName = CompressionCodecName.SNAPPY; + final int blockSize = 256 * 1024 * 1024; + final int pageSize = 64 * 1024; + final Path path = new Path(filename); + Schema schema = null; + ParquetWriter writer = null; - int blockSize = 256 * 1024 * 1024; - int pageSize = 64 * 1024; - - Path path = new Path(fileName); - final ParquetWriter writer = - new AvroParquetWriter<>(path, avroSchema, compressionCodecName, blockSize, pageSize, true, conf); - - return new RecordWriter() { @Override - public void write(SinkRecord record) throws IOException { + public void write(SinkRecord record) { + if (schema == null) { + schema = record.valueSchema(); + try { + log.info("Opening record writer for: {}", filename); + org.apache.avro.Schema avroSchema = avroData.fromConnectSchema(schema); + writer = new AvroParquetWriter<>( + path, + avroSchema, + compressionCodecName, + blockSize, + pageSize, + true, + conf.getHadoopConfiguration() + ); + } catch (IOException e) { + throw new ConnectException(e); + } + } + + log.trace("Sink record: {}", record.toString()); Object value = avroData.fromConnectData(record.valueSchema(), record.value()); - writer.write((GenericRecord) value); + try { + writer.write((GenericRecord) value); + } catch (IOException e) { + throw new ConnectException(e); + } } @Override - public void close() throws IOException { - writer.close(); + public void commit() {} + + @Override + public void close() { + if (writer != null) { + try { + writer.close(); + } catch(IOException e){ + throw new ConnectException(e); + } + } } }; } diff --git 
a/src/main/java/io/confluent/connect/hdfs/partitioner/DailyPartitioner.java b/src/main/java/io/confluent/connect/hdfs/partitioner/DailyPartitioner.java index ece620b74..191d1a738 100644 --- a/src/main/java/io/confluent/connect/hdfs/partitioner/DailyPartitioner.java +++ b/src/main/java/io/confluent/connect/hdfs/partitioner/DailyPartitioner.java @@ -14,40 +14,10 @@ package io.confluent.connect.hdfs.partitioner; -import org.apache.kafka.common.config.ConfigException; -import org.joda.time.DateTimeZone; +import org.apache.hadoop.hive.metastore.api.FieldSchema; -import java.util.Locale; -import java.util.Map; -import java.util.concurrent.TimeUnit; - -import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; - -public class DailyPartitioner extends TimeBasedPartitioner { - - private static long partitionDurationMs = TimeUnit.HOURS.toMillis(24); - private static String pathFormat = "'year'=YYYY/'month'=MM/'day'=dd/"; - - @Override - public void configure(Map config) { - String localeString = (String) config.get(HdfsSinkConnectorConfig.LOCALE_CONFIG); - if (localeString.equals("")) { - throw new ConfigException(HdfsSinkConnectorConfig.LOCALE_CONFIG, - localeString, "Locale cannot be empty."); - } - String timeZoneString = (String) config.get(HdfsSinkConnectorConfig.TIMEZONE_CONFIG); - if (timeZoneString.equals("")) { - throw new ConfigException(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, - timeZoneString, "Timezone cannot be empty."); - } - String hiveIntString = (String) config.get(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG); - boolean hiveIntegration = hiveIntString != null && hiveIntString.toLowerCase().equals("true"); - Locale locale = new Locale(localeString); - DateTimeZone timeZone = DateTimeZone.forID(timeZoneString); - init(partitionDurationMs, pathFormat, locale, timeZone, hiveIntegration); - } - - public String getPathFormat() { - return pathFormat; - } +@Deprecated +public class DailyPartitioner + extends io.confluent.connect.storage.partitioner.DailyPartitioner + implements Partitioner { } diff --git a/src/main/java/io/confluent/connect/hdfs/partitioner/DefaultPartitioner.java b/src/main/java/io/confluent/connect/hdfs/partitioner/DefaultPartitioner.java index b217085e5..29569de73 100644 --- a/src/main/java/io/confluent/connect/hdfs/partitioner/DefaultPartitioner.java +++ b/src/main/java/io/confluent/connect/hdfs/partitioner/DefaultPartitioner.java @@ -15,35 +15,9 @@ package io.confluent.connect.hdfs.partitioner; import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.kafka.connect.sink.SinkRecord; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -public class DefaultPartitioner implements Partitioner { - - private static final String partitionField = "partition"; - private final List partitionFields = new ArrayList<>();; - - @Override - public void configure(Map config) { - partitionFields.add(new FieldSchema(partitionField, TypeInfoFactory.stringTypeInfo.toString(), "")); - } - - @Override - public String encodePartition(SinkRecord sinkRecord) { - return partitionField + "=" + String.valueOf(sinkRecord.kafkaPartition()); - } - - @Override - public String generatePartitionedPath(String topic, String encodedPartition) { - return topic + "/" + encodedPartition; - } - - @Override - public List partitionFields() { - return partitionFields; - } +@Deprecated +public class DefaultPartitioner + extends io.confluent.connect.storage.partitioner.DefaultPartitioner + implements 
Partitioner { } diff --git a/src/main/java/io/confluent/connect/hdfs/partitioner/FieldPartitioner.java b/src/main/java/io/confluent/connect/hdfs/partitioner/FieldPartitioner.java index 97a5f720d..cd5d0d0d0 100644 --- a/src/main/java/io/confluent/connect/hdfs/partitioner/FieldPartitioner.java +++ b/src/main/java/io/confluent/connect/hdfs/partitioner/FieldPartitioner.java @@ -15,69 +15,9 @@ package io.confluent.connect.hdfs.partitioner; import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.Schema.Type; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.sink.SinkRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; -import io.confluent.connect.hdfs.errors.PartitionException; - -public class FieldPartitioner implements Partitioner { - private static final Logger log = LoggerFactory.getLogger(FieldPartitioner.class); - private static String fieldName; - private List partitionFields = new ArrayList<>(); - - @Override - public void configure(Map config) { - fieldName = (String) config.get(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG); - partitionFields.add(new FieldSchema(fieldName, TypeInfoFactory.stringTypeInfo.toString(), "")); - } - - @Override - public String encodePartition(SinkRecord sinkRecord) { - Object value = sinkRecord.value(); - Schema valueSchema = sinkRecord.valueSchema(); - if (value instanceof Struct) { - Struct struct = (Struct) value; - Object partitionKey = struct.get(fieldName); - Type type = valueSchema.field(fieldName).schema().type(); - switch (type) { - case INT8: - case INT16: - case INT32: - case INT64: - Number record = (Number) partitionKey; - return fieldName + "=" + record.toString(); - case STRING: - return fieldName + "=" + (String) partitionKey; - case BOOLEAN: - boolean booleanRecord = (boolean) partitionKey; - return fieldName + "=" + Boolean.toString(booleanRecord); - default: - log.error("Type {} is not supported as a partition key.", type.getName()); - throw new PartitionException("Error encoding partition."); - } - } else { - log.error("Value is not Struct type."); - throw new PartitionException("Error encoding partition."); - } - } - - @Override - public String generatePartitionedPath(String topic, String encodedPartition) { - return topic + "/" + encodedPartition; - } - - @Override - public List partitionFields() { - return partitionFields; - } +@Deprecated +public class FieldPartitioner + extends io.confluent.connect.storage.partitioner.FieldPartitioner + implements Partitioner { } diff --git a/src/main/java/io/confluent/connect/hdfs/partitioner/HourlyPartitioner.java b/src/main/java/io/confluent/connect/hdfs/partitioner/HourlyPartitioner.java index 777da994c..d7de2ea15 100644 --- a/src/main/java/io/confluent/connect/hdfs/partitioner/HourlyPartitioner.java +++ b/src/main/java/io/confluent/connect/hdfs/partitioner/HourlyPartitioner.java @@ -14,40 +14,10 @@ package io.confluent.connect.hdfs.partitioner; -import org.apache.kafka.common.config.ConfigException; -import org.joda.time.DateTimeZone; +import org.apache.hadoop.hive.metastore.api.FieldSchema; -import java.util.Locale; -import java.util.Map; -import java.util.concurrent.TimeUnit; - -import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; - -public class 
HourlyPartitioner extends TimeBasedPartitioner { - - private static long partitionDurationMs = TimeUnit.HOURS.toMillis(1); - private static String pathFormat = "'year'=YYYY/'month'=MM/'day'=dd/'hour'=HH/"; - - @Override - public void configure(Map config) { - String localeString = (String) config.get(HdfsSinkConnectorConfig.LOCALE_CONFIG); - if (localeString.equals("")) { - throw new ConfigException(HdfsSinkConnectorConfig.LOCALE_CONFIG, - localeString, "Locale cannot be empty."); - } - String timeZoneString = (String) config.get(HdfsSinkConnectorConfig.TIMEZONE_CONFIG); - if (timeZoneString.equals("")) { - throw new ConfigException(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, - timeZoneString, "Timezone cannot be empty."); - } - String hiveIntString = (String) config.get(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG); - boolean hiveIntegration = hiveIntString != null && hiveIntString.toLowerCase().equals("true"); - Locale locale = new Locale(localeString); - DateTimeZone timeZone = DateTimeZone.forID(timeZoneString); - init(partitionDurationMs, pathFormat, locale, timeZone, hiveIntegration); - } - - public String getPathFormat() { - return pathFormat; - } +@Deprecated +public class HourlyPartitioner + extends io.confluent.connect.storage.partitioner.HourlyPartitioner + implements Partitioner { } diff --git a/src/main/java/io/confluent/connect/hdfs/partitioner/Partitioner.java b/src/main/java/io/confluent/connect/hdfs/partitioner/Partitioner.java index 1effdd102..221324cc8 100644 --- a/src/main/java/io/confluent/connect/hdfs/partitioner/Partitioner.java +++ b/src/main/java/io/confluent/connect/hdfs/partitioner/Partitioner.java @@ -24,9 +24,15 @@ * Partition incoming records, and generates directories and file names in which to store the * incoming records. */ -public interface Partitioner { +@Deprecated +public interface Partitioner + extends io.confluent.connect.storage.partitioner.Partitioner { + @Override void configure(Map config); + @Override String encodePartition(SinkRecord sinkRecord); + @Override String generatePartitionedPath(String topic, String encodedPartition); + @Override List partitionFields(); } diff --git a/src/main/java/io/confluent/connect/hdfs/partitioner/TimeBasedPartitioner.java b/src/main/java/io/confluent/connect/hdfs/partitioner/TimeBasedPartitioner.java index 359b10820..d96440270 100644 --- a/src/main/java/io/confluent/connect/hdfs/partitioner/TimeBasedPartitioner.java +++ b/src/main/java/io/confluent/connect/hdfs/partitioner/TimeBasedPartitioner.java @@ -15,115 +15,9 @@ package io.confluent.connect.hdfs.partitioner; import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.kafka.common.config.ConfigException; -import org.apache.kafka.connect.sink.SinkRecord; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; - -public class TimeBasedPartitioner implements Partitioner { - - // Duration of a partition in milliseconds. 
- private long partitionDurationMs; - private DateTimeFormatter formatter; - protected List partitionFields = new ArrayList<>(); - private static String patternString = "'year'=Y{1,5}/('month'=M{1,5}/)?('day'=d{1,3}/)?('hour'=H{1,3}/)?('minute'=m{1,3}/)?"; - private static Pattern pattern = Pattern.compile(patternString); - - protected void init(long partitionDurationMs, String pathFormat, Locale locale, - DateTimeZone timeZone, boolean hiveIntegration) { - this.partitionDurationMs = partitionDurationMs; - this.formatter = getDateTimeFormatter(pathFormat, timeZone).withLocale(locale); - addToPartitionFields(pathFormat, hiveIntegration); - } - - private static DateTimeFormatter getDateTimeFormatter(String str, DateTimeZone timeZone) { - return DateTimeFormat.forPattern(str).withZone(timeZone); - } - - public static long getPartition(long timeGranularityMs, long timestamp, DateTimeZone timeZone) { - long adjustedTimeStamp = timeZone.convertUTCToLocal(timestamp); - long partitionedTime = (adjustedTimeStamp / timeGranularityMs) * timeGranularityMs; - return timeZone.convertLocalToUTC(partitionedTime, false); - } - - @Override - public void configure(Map config) { - long partitionDurationMs = (long) config.get(HdfsSinkConnectorConfig.PARTITION_DURATION_MS_CONFIG); - if (partitionDurationMs < 0) { - throw new ConfigException(HdfsSinkConnectorConfig.PARTITION_DURATION_MS_CONFIG, - partitionDurationMs, "Partition duration needs to be a positive."); - } - - String pathFormat = (String) config.get(HdfsSinkConnectorConfig.PATH_FORMAT_CONFIG); - if (pathFormat.equals("")) { - throw new ConfigException(HdfsSinkConnectorConfig.PATH_FORMAT_CONFIG, - pathFormat, "Path format cannot be empty."); - } - - String localeString = (String) config.get(HdfsSinkConnectorConfig.LOCALE_CONFIG); - if (localeString.equals("")) { - throw new ConfigException(HdfsSinkConnectorConfig.LOCALE_CONFIG, - localeString, "Locale cannot be empty."); - } - String timeZoneString = (String) config.get(HdfsSinkConnectorConfig.TIMEZONE_CONFIG); - if (timeZoneString.equals("")) { - throw new ConfigException(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, - timeZoneString, "Timezone cannot be empty."); - } - - String hiveIntString = (String) config.get(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG); - boolean hiveIntegration = hiveIntString != null && hiveIntString.toLowerCase().equals("true"); - - Locale locale = new Locale(localeString); - DateTimeZone timeZone = DateTimeZone.forID(timeZoneString); - init(partitionDurationMs, pathFormat, locale, timeZone, hiveIntegration); - } - - @Override - public String encodePartition(SinkRecord sinkRecord) { - long timestamp = System.currentTimeMillis(); - DateTime bucket = new DateTime(getPartition(partitionDurationMs, timestamp, formatter.getZone())); - return bucket.toString(formatter); - } - - - @Override - public String generatePartitionedPath(String topic, String encodedPartition) { - return topic + "/" + encodedPartition; - } - - @Override - public List partitionFields() { - return partitionFields; - } - - private boolean verifyDateTimeFormat(String pathFormat) { - Matcher m = pattern.matcher(pathFormat); - return m.matches(); - } - - private void addToPartitionFields(String pathFormat, boolean hiveIntegration) { - if (hiveIntegration && !verifyDateTimeFormat(pathFormat)) { - throw new ConfigException(HdfsSinkConnectorConfig.PATH_FORMAT_CONFIG, pathFormat, - "Path format doesn't meet the requirements for Hive integration, " - + "which require prefixing each DateTime component with its name."); - 
} - for (String field: pathFormat.split("/")) { - String[] parts = field.split("="); - FieldSchema fieldSchema = new FieldSchema(parts[0].replace("'", ""), TypeInfoFactory.stringTypeInfo.toString(), ""); - partitionFields.add(fieldSchema); - } - } +@Deprecated +public class TimeBasedPartitioner + extends io.confluent.connect.storage.partitioner.TimeBasedPartitioner + implements Partitioner { } diff --git a/src/main/java/io/confluent/connect/hdfs/schema/Compatibility.java b/src/main/java/io/confluent/connect/hdfs/schema/Compatibility.java index 27333bf5c..62d0cc893 100644 --- a/src/main/java/io/confluent/connect/hdfs/schema/Compatibility.java +++ b/src/main/java/io/confluent/connect/hdfs/schema/Compatibility.java @@ -14,6 +14,7 @@ package io.confluent.connect.hdfs.schema; +@Deprecated public enum Compatibility { NONE, BACKWARD, diff --git a/src/main/java/io/confluent/connect/hdfs/schema/SchemaUtils.java b/src/main/java/io/confluent/connect/hdfs/schema/SchemaUtils.java index c7e695ffd..21c586c75 100644 --- a/src/main/java/io/confluent/connect/hdfs/schema/SchemaUtils.java +++ b/src/main/java/io/confluent/connect/hdfs/schema/SchemaUtils.java @@ -19,6 +19,7 @@ import org.apache.kafka.connect.errors.SchemaProjectorException; import org.apache.kafka.connect.sink.SinkRecord; +@Deprecated public class SchemaUtils { public static Compatibility getCompatibility(String compatibilityString) { diff --git a/src/main/java/io/confluent/connect/hdfs/storage/HdfsStorage.java b/src/main/java/io/confluent/connect/hdfs/storage/HdfsStorage.java index 5069fbf7e..b0c6e6cd5 100644 --- a/src/main/java/io/confluent/connect/hdfs/storage/HdfsStorage.java +++ b/src/main/java/io/confluent/connect/hdfs/storage/HdfsStorage.java @@ -16,81 +16,116 @@ package io.confluent.connect.hdfs.storage; -import org.apache.hadoop.conf.Configuration; +import org.apache.avro.file.SeekableInput; +import org.apache.avro.mapred.FsInput; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.errors.ConnectException; import java.io.IOException; +import java.io.OutputStream; import java.net.URI; +import java.util.Arrays; +import java.util.List; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.wal.FSWAL; import io.confluent.connect.hdfs.wal.WAL; -public class HdfsStorage implements Storage { +public class HdfsStorage + implements io.confluent.connect.storage.Storage>, + Storage { private final FileSystem fs; - private final Configuration conf; + private final HdfsSinkConnectorConfig conf; private final String url; - public HdfsStorage(Configuration conf, String url) throws IOException { - fs = FileSystem.newInstance(URI.create(url), conf); + // Visible for testing. 
+ protected HdfsStorage(HdfsSinkConnectorConfig conf, String url, FileSystem fs) { this.conf = conf; this.url = url; + this.fs = fs; } - @Override - public FileStatus[] listStatus(String path, PathFilter filter) throws IOException { - return fs.listStatus(new Path(path), filter); + public HdfsStorage(HdfsSinkConnectorConfig conf, String url) throws IOException { + this.conf = conf; + this.url = url; + fs = FileSystem.newInstance(URI.create(url), conf.getHadoopConfiguration()); } - @Override - public FileStatus[] listStatus(String path) throws IOException { - return fs.listStatus(new Path(path)); + public List list(String path, PathFilter filter) { + try { + return Arrays.asList(fs.listStatus(new Path(path), filter)); + } catch (IOException e) { + throw new ConnectException(e); + } } @Override - public void append(String filename, Object object) throws IOException { - + public List list(String path) { + try { + return Arrays.asList(fs.listStatus(new Path(path))); + } catch (IOException e) { + throw new ConnectException(e); + } } @Override - public boolean mkdirs(String filename) throws IOException { - return fs.mkdirs(new Path(filename)); + public OutputStream append(String filename) { + throw new UnsupportedOperationException(); } @Override - public boolean exists(String filename) throws IOException { - return fs.exists(new Path(filename)); + public boolean create(String filename) { + try { + return fs.mkdirs(new Path(filename)); + } catch (IOException e) { + throw new ConnectException(e); + } } @Override - public void commit(String tempFile, String committedFile) throws IOException { - renameFile(tempFile, committedFile); + public boolean exists(String filename) { + try { + return fs.exists(new Path(filename)); + } catch (IOException e) { + throw new ConnectException(e); + } } + public void commit(String tempFile, String committedFile) { + renameFile(tempFile, committedFile); + } @Override - public void delete(String filename) throws IOException { - fs.delete(new Path(filename), true); + public void delete(String filename) { + try { + fs.delete(new Path(filename), true); + } catch (IOException e) { + throw new ConnectException(e); + } } @Override - public void close() throws IOException { + public void close() { if (fs != null) { - fs.close(); + try { + fs.close(); + } catch (IOException e) { + throw new ConnectException(e); + } } } - @Override public WAL wal(String topicsDir, TopicPartition topicPart) { return new FSWAL(topicsDir, topicPart, this); } @Override - public Configuration conf() { + public HdfsSinkConnectorConfig conf() { return conf; } @@ -99,14 +134,41 @@ public String url() { return url; } - private void renameFile(String sourcePath, String targetPath) throws IOException { + private void renameFile(String sourcePath, String targetPath) { if (sourcePath.equals(targetPath)) { return; } - final Path srcPath = new Path(sourcePath); - final Path dstPath = new Path(targetPath); - if (fs.exists(srcPath)) { - fs.rename(srcPath, dstPath); + try { + final Path srcPath = new Path(sourcePath); + final Path dstPath = new Path(targetPath); + if (fs.exists(srcPath)) { + fs.rename(srcPath, dstPath); + } + } catch (IOException e) { + throw new ConnectException(e); + } + } + + @Override + public SeekableInput open(String filename, HdfsSinkConnectorConfig conf) { + try { + return new FsInput(new Path(filename), conf.getHadoopConfiguration()); + } catch (IOException e) { + throw new ConnectException(e); + } + } + + public OutputStream create(String filename, boolean overwrite) { + return 
create(filename, this.conf, overwrite); + } + + @Override + public OutputStream create(String filename, HdfsSinkConnectorConfig conf, boolean overwrite) { + try { + Path path = new Path(filename); + return path.getFileSystem(conf.getHadoopConfiguration()).create(path); + } catch (IOException e) { + throw new ConnectException(e); } } } diff --git a/src/main/java/io/confluent/connect/hdfs/storage/Storage.java b/src/main/java/io/confluent/connect/hdfs/storage/Storage.java index e1f6a1f1b..598d5caa4 100644 --- a/src/main/java/io/confluent/connect/hdfs/storage/Storage.java +++ b/src/main/java/io/confluent/connect/hdfs/storage/Storage.java @@ -16,25 +16,12 @@ package io.confluent.connect.hdfs.storage; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.PathFilter; -import org.apache.kafka.common.TopicPartition; -import java.io.IOException; +import java.util.List; -import io.confluent.connect.hdfs.wal.WAL; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; -public interface Storage { - boolean exists(String filename) throws IOException; - boolean mkdirs(String filename) throws IOException; - void append(String filename, Object object) throws IOException; - void delete(String filename) throws IOException; - void commit(String tempFile, String committedFile) throws IOException; - void close() throws IOException; - WAL wal(String topicsDir, TopicPartition topicPart); - FileStatus[] listStatus(String path, PathFilter filter) throws IOException; - FileStatus[] listStatus(String path) throws IOException; - String url(); - Configuration conf(); -} +@Deprecated +public interface Storage + extends io.confluent.connect.storage.Storage> {} diff --git a/src/main/java/io/confluent/connect/hdfs/storage/StorageFactory.java b/src/main/java/io/confluent/connect/hdfs/storage/StorageFactory.java index 479442346..248e4a096 100644 --- a/src/main/java/io/confluent/connect/hdfs/storage/StorageFactory.java +++ b/src/main/java/io/confluent/connect/hdfs/storage/StorageFactory.java @@ -21,6 +21,7 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; +@Deprecated public class StorageFactory { public static Storage createStorage(Class storageClass, Configuration conf, String url) { try { diff --git a/src/main/java/io/confluent/connect/hdfs/wal/FSWAL.java b/src/main/java/io/confluent/connect/hdfs/wal/FSWAL.java index c4ce18c65..bc3cb52a4 100644 --- a/src/main/java/io/confluent/connect/hdfs/wal/FSWAL.java +++ b/src/main/java/io/confluent/connect/hdfs/wal/FSWAL.java @@ -14,11 +14,11 @@ package io.confluent.connect.hdfs.wal; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.ipc.RemoteException; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.DataException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,7 +27,8 @@ import java.util.Map; import io.confluent.connect.hdfs.FileUtils; -import io.confluent.connect.hdfs.storage.Storage; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; +import io.confluent.connect.hdfs.storage.HdfsStorage; import io.confluent.connect.hdfs.wal.WALFile.Reader; import io.confluent.connect.hdfs.wal.WALFile.Writer; @@ -40,10 +41,10 @@ public class FSWAL implements WAL { private WALFile.Writer writer = null; private WALFile.Reader reader = null; private String logFile = null; - private Configuration conf = null; - private 
Storage storage = null; + private HdfsSinkConnectorConfig conf = null; + private HdfsStorage storage = null; - public FSWAL(String logsDir, TopicPartition topicPart, Storage storage) + public FSWAL(String logsDir, TopicPartition topicPart, HdfsStorage storage) throws ConnectException { this.storage = storage; this.conf = storage.conf(); @@ -60,7 +61,7 @@ public void append(String tempFile, String committedFile) throws ConnectExceptio writer.append(key, value); writer.hsync(); } catch (IOException e) { - throw new ConnectException(e); + throw new DataException(e); } } @@ -88,7 +89,7 @@ public void acquireLease() throws ConnectException { throw new ConnectException(e); } } catch (IOException e) { - throw new ConnectException("Error creating writer for log file " + logFile, e); + throw new DataException("Error creating writer for log file " + logFile, e); } } if (sleepIntervalMs >= MAX_SLEEP_INTERVAL_MS) { @@ -104,7 +105,7 @@ public void apply() throws ConnectException { } acquireLease(); if (reader == null) { - reader = new WALFile.Reader(conf, Reader.file(new Path(logFile))); + reader = new WALFile.Reader(conf.getHadoopConfiguration(), Reader.file(new Path(logFile))); } Map entries = new HashMap<>(); WALEntry key = new WALEntry(); @@ -128,22 +129,18 @@ public void apply() throws ConnectException { } } } catch (IOException e) { - throw new ConnectException(e); + throw new DataException(e); } } @Override public void truncate() throws ConnectException { - try { - String oldLogFile = logFile + ".1"; - storage.delete(oldLogFile); - storage.commit(logFile, oldLogFile); - // Clean out references to the current WAL file. - // Open a new one on the next lease acquisition. - close(); - } catch (IOException e) { - throw new ConnectException(e); - } + String oldLogFile = logFile + ".1"; + storage.delete(oldLogFile); + storage.commit(logFile, oldLogFile); + // Clean out references to the current WAL file. + // Open a new one on the next lease acquisition. 
+ close(); } @Override @@ -158,7 +155,7 @@ public void close() throws ConnectException { reader = null; } } catch (IOException e) { - throw new ConnectException("Error closing " + logFile, e); + throw new DataException("Error closing " + logFile, e); } } diff --git a/src/main/java/io/confluent/connect/hdfs/wal/WAL.java b/src/main/java/io/confluent/connect/hdfs/wal/WAL.java index f14bf1276..6845a18af 100644 --- a/src/main/java/io/confluent/connect/hdfs/wal/WAL.java +++ b/src/main/java/io/confluent/connect/hdfs/wal/WAL.java @@ -16,15 +16,6 @@ package io.confluent.connect.hdfs.wal; -import org.apache.kafka.connect.errors.ConnectException; - -public interface WAL { - String beginMarker = "BEGIN"; - String endMarker = "END"; - void acquireLease() throws ConnectException; - void append(String tempFile, String committedFile) throws ConnectException; - void apply() throws ConnectException; - void truncate() throws ConnectException; - void close() throws ConnectException; - String getLogFile(); +@Deprecated +public interface WAL extends io.confluent.connect.storage.wal.WAL { } diff --git a/src/main/java/io/confluent/connect/hdfs/wal/WALFile.java b/src/main/java/io/confluent/connect/hdfs/wal/WALFile.java index 2e95cd73d..bc7fbb467 100644 --- a/src/main/java/io/confluent/connect/hdfs/wal/WALFile.java +++ b/src/main/java/io/confluent/connect/hdfs/wal/WALFile.java @@ -44,6 +44,8 @@ import java.security.MessageDigest; import java.util.Arrays; +import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; + public class WALFile { private static final Log log = LogFactory.getLog(WALFile.class); @@ -63,14 +65,12 @@ public class WALFile { private WALFile() {} - public static Writer createWriter(Configuration conf, Writer.Option... opts) throws IOException { + public static Writer createWriter(HdfsSinkConnectorConfig conf, Writer.Option... opts) throws IOException { return new Writer(conf, opts); } public static class Writer implements Closeable, Syncable { - - private Configuration conf; private FSDataOutputStream out; private DataOutputBuffer buffer = new DataOutputBuffer(); boolean ownOutputStream = true; @@ -170,7 +170,8 @@ static class AppendIfExistsOption extends Options.BooleanOption implements Optio } - Writer(Configuration conf, Option... opts) throws IOException { + Writer(HdfsSinkConnectorConfig connectorConfig, Option... 
opts) throws IOException { + Configuration conf = connectorConfig.getHadoopConfiguration(); BlockSizeOption blockSizeOption = Options.getOption(BlockSizeOption.class, opts); BufferSizeOption bufferSizeOption = @@ -227,12 +228,12 @@ static class AppendIfExistsOption extends Options.BooleanOption implements Optio out = streamOption.getValue(); } - init(conf, out, ownStream); + init(connectorConfig, out, ownStream); } - void init(Configuration conf, FSDataOutputStream out, boolean ownStream) + void init(HdfsSinkConnectorConfig connectorConfig, FSDataOutputStream out, boolean ownStream) throws IOException { - this.conf = conf; + Configuration conf = connectorConfig.getHadoopConfiguration(); this.out = out; this.ownOutputStream = ownStream; SerializationFactory serializationFactory = new SerializationFactory(conf); diff --git a/src/test/java/io/confluent/connect/hdfs/DataFileReader.java b/src/test/java/io/confluent/connect/hdfs/DataFileReader.java new file mode 100644 index 000000000..0c9724c92 --- /dev/null +++ b/src/test/java/io/confluent/connect/hdfs/DataFileReader.java @@ -0,0 +1,15 @@ +package io.confluent.connect.hdfs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Collection; + +/** + * Interface that corresponds to SchemaFileReader but reads data objects. Only used to validate + * output during tests. + */ +public interface DataFileReader { + Collection readData(Configuration conf, Path path) throws IOException; +} diff --git a/src/test/java/io/confluent/connect/hdfs/FailureRecoveryTest.java b/src/test/java/io/confluent/connect/hdfs/FailureRecoveryTest.java index 6e3ce5e78..81ca75270 100644 --- a/src/test/java/io/confluent/connect/hdfs/FailureRecoveryTest.java +++ b/src/test/java/io/confluent/connect/hdfs/FailureRecoveryTest.java @@ -29,6 +29,8 @@ import io.confluent.connect.hdfs.utils.MemoryFormat; import io.confluent.connect.hdfs.utils.MemoryRecordWriter; import io.confluent.connect.hdfs.utils.MemoryStorage; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.format.*; import static org.junit.Assert.assertEquals; @@ -44,16 +46,13 @@ public void setUp() throws Exception { @Override protected Map createProps() { Map props = super.createProps(); - props.put(HdfsSinkConnectorConfig.STORAGE_CLASS_CONFIG, MemoryStorage.class.getName()); + props.put(StorageCommonConfig.STORAGE_CLASS_CONFIG, MemoryStorage.class.getName()); props.put(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG, MemoryFormat.class.getName()); return props; } @Test public void testCommitFailure() throws Exception { - Map props = createProps(); - HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props); - String key = "key"; Schema schema = createSchema(); Struct record = createRecord(schema); @@ -87,15 +86,12 @@ public void testCommitFailure() throws Exception { content = data.get(logFile); assertEquals(6, content.size()); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); } @Test public void testWriterFailureMultiPartitions() throws Exception { - Map props = createProps(); - HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props); - String key = "key"; Schema schema = createSchema(); Struct record = createRecord(schema); @@ -121,7 +117,7 @@ public void testWriterFailureMultiPartitions() throws Exception { } String encodedPartition = "partition=" + String.valueOf(PARTITION); - Map> writers = hdfsWriter.getWriters(TOPIC_PARTITION); + 
Map writers = hdfsWriter.getWriters(TOPIC_PARTITION); MemoryRecordWriter writer = (MemoryRecordWriter) writers.get(encodedPartition); writer.setFailure(MemoryRecordWriter.Failure.writeFailure); hdfsWriter.write(sinkRecords); @@ -163,15 +159,13 @@ public void testWriterFailureMultiPartitions() throws Exception { } hdfsWriter.write(new ArrayList()); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); } @Test public void testWriterFailure() throws Exception { - Map props = createProps(); - - HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props); + HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(properties); String key = "key"; Schema schema = createSchema(); @@ -190,7 +184,7 @@ public void testWriterFailure() throws Exception { } String encodedPartition = "partition=" + String.valueOf(PARTITION); - Map> writers = hdfsWriter.getWriters(TOPIC_PARTITION); + Map writers = hdfsWriter.getWriters(TOPIC_PARTITION); MemoryRecordWriter writer = (MemoryRecordWriter) writers.get(encodedPartition); writer.setFailure(MemoryRecordWriter.Failure.writeFailure); @@ -226,7 +220,7 @@ public void testWriterFailure() throws Exception { } hdfsWriter.write(new ArrayList()); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); } } diff --git a/src/test/java/io/confluent/connect/hdfs/FormatAPIDataWriterCompatibilityTest.java b/src/test/java/io/confluent/connect/hdfs/FormatAPIDataWriterCompatibilityTest.java new file mode 100644 index 000000000..2bcd0d4d5 --- /dev/null +++ b/src/test/java/io/confluent/connect/hdfs/FormatAPIDataWriterCompatibilityTest.java @@ -0,0 +1,75 @@ +package io.confluent.connect.hdfs; + +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.sink.SinkRecord; +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Map; + +import io.confluent.connect.hdfs.avro.AvroDataFileReader; +import io.confluent.connect.hdfs.hive.HiveTestBase; +import io.confluent.connect.storage.hive.HiveConfig; + +/** + * Test to ensure we can still instantiate & use the old-style HDFS-only interfaces instead of + * those from storage-common and use them with DataWriter + */ +public class FormatAPIDataWriterCompatibilityTest extends HiveTestBase { + + @Before + public void setUp() throws Exception { + super.setUp(); + dataFileReader = new AvroDataFileReader(); + extension = ".avro"; + } + + + @Test + public void dataWriterNewFormatAPICompatibilityTest() { + DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); + + hdfsWriter.recover(TOPIC_PARTITION); + + String key = "key"; + Schema schema = createSchema(); + Struct record = createRecord(schema); + + Collection sinkRecords = new ArrayList<>(); + for (long offset = 0; offset < 7; offset++) { + SinkRecord sinkRecord = + new SinkRecord(TOPIC, PARTITION, Schema.STRING_SCHEMA, key, schema, record, offset); + sinkRecords.add(sinkRecord); + } + + hdfsWriter.write(sinkRecords); + hdfsWriter.close(); + hdfsWriter.stop(); + + Map props = createProps(); + props.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true"); + HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props); + + hdfsWriter = new DataWriter(config, context, avroData); + hdfsWriter.syncWithHive(); + + // Since we're not using a real format, we won't validate the output. 
However, this should at + // least exercise the code paths for the old Format class + + hdfsWriter.close(); + hdfsWriter.stop(); + } + + @Override + protected Map createProps() { + Map props = super.createProps(); + props.put(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG, OldFormat.class.getName()); + // Enable Hive integration to make sure we exercise the paths that get HiveUtils + props.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true"); + return props; + } + +} diff --git a/src/test/java/io/confluent/connect/hdfs/FormatAPITopicPartitionWriterCompatibilityTest.java b/src/test/java/io/confluent/connect/hdfs/FormatAPITopicPartitionWriterCompatibilityTest.java new file mode 100644 index 000000000..78b9795c4 --- /dev/null +++ b/src/test/java/io/confluent/connect/hdfs/FormatAPITopicPartitionWriterCompatibilityTest.java @@ -0,0 +1,115 @@ +package io.confluent.connect.hdfs; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.sink.SinkRecord; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import io.confluent.connect.hdfs.avro.AvroDataFileReader; +import io.confluent.connect.hdfs.filter.CommittedFileFilter; +import io.confluent.connect.hdfs.partitioner.DefaultPartitioner; +import io.confluent.connect.hdfs.partitioner.Partitioner; +import io.confluent.connect.hdfs.storage.HdfsStorage; +import io.confluent.connect.storage.StorageFactory; +import io.confluent.connect.storage.common.StorageCommonConfig; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Test to ensure we can still instantiate & use the old-style HDFS-only interfaces instead of + * those from storage-common and use them with TopicPartitionWriter + */ +public class FormatAPITopicPartitionWriterCompatibilityTest extends TestWithMiniDFSCluster { + private RecordWriterProvider writerProvider = null; + private io.confluent.connect.storage.format.RecordWriterProvider + newWriterProvider; + private HdfsStorage storage; + + @Override + protected Map createProps() { + return super.createProps(); + } + + @Before + public void setUp() throws Exception { + super.setUp(); + + @SuppressWarnings("unchecked") + Class storageClass = (Class) + connectorConfig.getClass(StorageCommonConfig.STORAGE_CLASS_CONFIG); + storage = StorageFactory.createStorage( + storageClass, + HdfsSinkConnectorConfig.class, + connectorConfig, + url + ); + + Format format = new OldFormat(); + writerProvider = format.getRecordWriterProvider(); + newWriterProvider = null; + dataFileReader = new AvroDataFileReader(); + extension = writerProvider.getExtension(); + createTopicDir(url, topicsDir, TOPIC); + createLogsDir(url, logsDir); + } + + @Test + public void testWriteRecordDefaultWithPadding() throws Exception { + Partitioner partitioner = new DefaultPartitioner(); + partitioner.configure(parsedConfig); + TopicPartitionWriter topicPartitionWriter = new TopicPartitionWriter( + TOPIC_PARTITION, + storage, + writerProvider, + newWriterProvider, + partitioner, + connectorConfig, + context, + avroData + ); + + Schema schema = createSchema(); + List records = createRecordBatches(schema, 3, 3); + // Add a single records at the end of the batches sequence. 
Total records: 10 + records.add(createRecord(schema)); + List sinkRecords = createSinkRecords(records, schema); + + for (SinkRecord record : sinkRecords) { + topicPartitionWriter.buffer(record); + } + + topicPartitionWriter.recover(); + topicPartitionWriter.write(); + topicPartitionWriter.close(); + + // No verification since the format is a dummy format. We're really just trying to exercise + // the old APIs and any paths that should hit them (and not NPE due to the variables for + // new-style formats being null) + } + + private void createTopicDir(String url, String topicsDir, String topic) throws IOException { + Path path = new Path(FileUtils.topicDirectory(url, topicsDir, topic)); + if (!fs.exists(path)) { + fs.mkdirs(path); + } + } + + private void createLogsDir(String url, String logsDir) throws IOException { + Path path = new Path(url + "/" + logsDir); + if (!fs.exists(path)) { + fs.mkdirs(path); + } + } + +} diff --git a/src/test/java/io/confluent/connect/hdfs/HdfsSinkConnectorTestBase.java b/src/test/java/io/confluent/connect/hdfs/HdfsSinkConnectorTestBase.java index 835a0b83d..142712935 100644 --- a/src/test/java/io/confluent/connect/hdfs/HdfsSinkConnectorTestBase.java +++ b/src/test/java/io/confluent/connect/hdfs/HdfsSinkConnectorTestBase.java @@ -19,58 +19,61 @@ import org.apache.hadoop.conf.Configuration; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.SchemaBuilder; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.sink.SinkTaskContext; import org.junit.After; -import org.junit.Before; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; +import java.util.concurrent.TimeUnit; import io.confluent.connect.avro.AvroData; +import io.confluent.connect.hdfs.avro.AvroFormat; +import io.confluent.connect.hdfs.partitioner.DefaultPartitioner; +import io.confluent.connect.storage.StorageSinkTestBase; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.hive.schema.DefaultSchemaGenerator; +import io.confluent.connect.storage.partitioner.PartitionerConfig; -public class HdfsSinkConnectorTestBase { +public class HdfsSinkConnectorTestBase extends StorageSinkTestBase { - protected Configuration conf; - protected String url; - protected Map connectorProps; protected HdfsSinkConnectorConfig connectorConfig; + protected Map parsedConfig; + protected Configuration conf; protected String topicsDir; protected String logsDir; protected AvroData avroData; - protected MockSinkTaskContext context; - protected static final String TOPIC = "topic"; protected static final String TOPIC_WITH_DOTS = "topic.with.dots"; - protected static final int PARTITION = 12; - protected static final int PARTITION2 = 13; - protected static final int PARTITION3 = 14; - protected static final TopicPartition TOPIC_PARTITION = new TopicPartition(TOPIC, PARTITION); - protected static final TopicPartition TOPIC_PARTITION2 = new TopicPartition(TOPIC, PARTITION2); - protected static final TopicPartition TOPIC_PARTITION3 = new TopicPartition(TOPIC, PARTITION3); protected static final TopicPartition TOPIC_WITH_DOTS_PARTITION = new TopicPartition(TOPIC_WITH_DOTS, PARTITION); - protected static Set assignment; + @Override protected Map createProps() { - Map props = new HashMap<>(); + Map props = super.createProps(); + url = "memory://"; 
props.put(HdfsSinkConnectorConfig.HDFS_URL_CONFIG, url); + props.put(StorageCommonConfig.STORE_URL_CONFIG, url); props.put(HdfsSinkConnectorConfig.FLUSH_SIZE_CONFIG, "3"); - return props; - } + props.put( + StorageCommonConfig.STORAGE_CLASS_CONFIG, + "io.confluent.connect.hdfs.storage.HdfsStorage" + ); + props.put(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG, AvroFormat.class.getName()); + props.put( + PartitionerConfig.PARTITIONER_CLASS_CONFIG, + DefaultPartitioner.class.getName() + ); + props.put(PartitionerConfig.PARTITION_FIELD_NAME_CONFIG, "int"); + props.put( + PartitionerConfig.PARTITION_DURATION_MS_CONFIG, + String.valueOf(TimeUnit.HOURS.toMillis(1)) + ); + props.put(PartitionerConfig.PATH_FORMAT_CONFIG, "'year'=YYYY/'month'=MM/'day'=dd/'hour'=HH/"); + props.put(PartitionerConfig.LOCALE_CONFIG, "en"); + props.put(PartitionerConfig.TIMEZONE_CONFIG, "America/Los_Angeles"); - protected Schema createSchema() { - return SchemaBuilder.struct().name("record").version(1) - .field("boolean", Schema.BOOLEAN_SCHEMA) - .field("int", Schema.INT32_SCHEMA) - .field("long", Schema.INT64_SCHEMA) - .field("float", Schema.FLOAT32_SCHEMA) - .field("double", Schema.FLOAT64_SCHEMA) - .build(); + return props; } protected Struct createRecord(Schema schema, int ibase, float fbase) { @@ -82,31 +85,6 @@ protected Struct createRecord(Schema schema, int ibase, float fbase) { .put("double", (double) fbase); } - protected Struct createRecord(Schema schema) { - return createRecord(schema, 12, 12.2f); - } - - protected Schema createNewSchema() { - return SchemaBuilder.struct().name("record").version(2) - .field("boolean", Schema.BOOLEAN_SCHEMA) - .field("int", Schema.INT32_SCHEMA) - .field("long", Schema.INT64_SCHEMA) - .field("float", Schema.FLOAT32_SCHEMA) - .field("double", Schema.FLOAT64_SCHEMA) - .field("string", SchemaBuilder.string().defaultValue("abc").build()) - .build(); - } - - protected Struct createNewRecord(Schema newSchema) { - return new Struct(newSchema) - .put("boolean", true) - .put("int", 12) - .put("long", 12L) - .put("float", 12.2f) - .put("double", 12.2) - .put("string", "def"); - } - // Create a batch of records with incremental numeric field values. Total number of records is // given by 'size'. protected List createRecordBatch(Schema schema, int size) { @@ -129,93 +107,23 @@ protected List createRecordBatches(Schema schema, int batchSize, int bat return records; } - @Before + //@Before + @Override public void setUp() throws Exception { - conf = new Configuration(); - url = "memory://"; - connectorProps = createProps(); - // Configure immediately in setup for common case of just using this default. Subclasses can - // re-call this safely. 
- configureConnector(); - assignment = new HashSet<>(); - assignment.add(TOPIC_PARTITION); - assignment.add(TOPIC_PARTITION2); - context = new MockSinkTaskContext(); + super.setUp(); + connectorConfig = new HdfsSinkConnectorConfig(properties); + parsedConfig = new HashMap<>(connectorConfig.plainValues()); + conf = connectorConfig.getHadoopConfiguration(); + topicsDir = connectorConfig.getString(StorageCommonConfig.TOPICS_DIR_CONFIG); + logsDir = connectorConfig.getString(HdfsSinkConnectorConfig.LOGS_DIR_CONFIG); + avroData = new AvroData( + connectorConfig.getInt(HdfsSinkConnectorConfig.SCHEMA_CACHE_SIZE_CONFIG) + ); } @After + @Override public void tearDown() throws Exception { - if (assignment != null) { - assignment.clear(); - } - } - - protected void configureConnector() { - connectorConfig = new HdfsSinkConnectorConfig(connectorProps); - topicsDir = connectorConfig.getString(HdfsSinkConnectorConfig.TOPICS_DIR_CONFIG); - logsDir = connectorConfig.getString(HdfsSinkConnectorConfig.LOGS_DIR_CONFIG); - int schemaCacheSize = connectorConfig.getInt(HdfsSinkConnectorConfig.SCHEMA_CACHE_SIZE_CONFIG); - avroData = new AvroData(schemaCacheSize); - } - - protected static class MockSinkTaskContext implements SinkTaskContext { - - private Map offsets; - private long timeoutMs; - - public MockSinkTaskContext() { - this.offsets = new HashMap<>(); - this.timeoutMs = -1L; - } - - @Override - public void offset(Map offsets) { - this.offsets.putAll(offsets); - } - - @Override - public void offset(TopicPartition tp, long offset) { - offsets.put(tp, offset); - } - - /** - * Get offsets that the SinkTask has submitted to be reset. Used by the Copycat framework. - * @return the map of offsets - */ - public Map offsets() { - return offsets; - } - - @Override - public void timeout(long timeoutMs) { - this.timeoutMs = timeoutMs; - } - - /** - * Get the timeout in milliseconds set by SinkTasks. Used by the Copycat framework. - * @return the backoff timeout in milliseconds. - */ - public long timeout() { - return timeoutMs; - } - - /** - * Get the timeout in milliseconds set by SinkTasks. Used by the Copycat framework. - * @return the backoff timeout in milliseconds. - */ - - @Override - public Set assignment() { - return assignment; - } - - @Override - public void pause(TopicPartition... partitions) {} - - @Override - public void resume(TopicPartition... 
partitions) {} - - @Override - public void requestCommit() {} + super.tearDown(); } } diff --git a/src/test/java/io/confluent/connect/hdfs/HdfsSinkTaskTest.java b/src/test/java/io/confluent/connect/hdfs/HdfsSinkTaskTest.java index bbb781d74..32ba8a035 100644 --- a/src/test/java/io/confluent/connect/hdfs/HdfsSinkTaskTest.java +++ b/src/test/java/io/confluent/connect/hdfs/HdfsSinkTaskTest.java @@ -14,7 +14,6 @@ package io.confluent.connect.hdfs; -import io.confluent.kafka.serializers.NonRecordContainer; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.connect.data.Schema; @@ -30,10 +29,12 @@ import java.util.Map; import io.confluent.connect.avro.AvroData; +import io.confluent.connect.hdfs.avro.AvroDataFileReader; import io.confluent.connect.hdfs.avro.AvroFileReader; -import io.confluent.connect.hdfs.storage.Storage; -import io.confluent.connect.hdfs.storage.StorageFactory; -import io.confluent.connect.hdfs.wal.WAL; +import io.confluent.connect.hdfs.storage.HdfsStorage; +import io.confluent.connect.storage.StorageFactory; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.wal.WAL; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -44,17 +45,16 @@ public class HdfsSinkTaskTest extends TestWithMiniDFSCluster { private static final String DIRECTORY2 = TOPIC + "/" + "partition=" + String.valueOf(PARTITION2); private static final String extension = ".avro"; private static final String ZERO_PAD_FMT = "%010d"; - private final SchemaFileReader schemaFileReader = new AvroFileReader(avroData); + private final DataFileReader schemaFileReader = new AvroDataFileReader(); @Test public void testSinkTaskStart() throws Exception { + setUp(); createCommittedFiles(); - - Map props = createProps(); HdfsSinkTask task = new HdfsSinkTask(); task.initialize(context); - task.start(props); + task.start(properties); Map offsets = context.offsets(); assertEquals(offsets.size(), 2); @@ -68,11 +68,11 @@ public void testSinkTaskStart() throws Exception { @Test public void testSinkTaskStartNoCommittedFiles() throws Exception { - Map props = createProps(); + setUp(); HdfsSinkTask task = new HdfsSinkTask(); task.initialize(context); - task.start(props); + task.start(properties); // Even without any files in HDFS, we expect an explicit request to start from the beginning of the topic (which // either exists at offset 0, or offset 0 will be out of range and the consumer will reset to the smallest offset). 
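The comment above describes the expected recovery behaviour when HDFS holds no committed files: the task should explicitly request a restart from the beginning of each assigned partition. A minimal sketch of the check this amounts to against the mock context, assuming (as the comment notes) that the beginning of the topic is offset 0:

    // Sketch only: expected outcome for the no-committed-files case described above.
    Map<TopicPartition, Long> offsets = context.offsets();
    assertEquals(2, offsets.size());                         // one entry per assigned partition
    assertEquals(Long.valueOf(0L), offsets.get(TOPIC_PARTITION));
    assertEquals(Long.valueOf(0L), offsets.get(TOPIC_PARTITION2));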
@@ -88,6 +88,7 @@ public void testSinkTaskStartNoCommittedFiles() throws Exception { @Test public void testSinkTaskStartWithRecovery() throws Exception { + setUp(); Map> tempfiles = new HashMap<>(); List list1 = new ArrayList<>(); list1.add(FileUtils.tempFileName(url, topicsDir, DIRECTORY1, extension)); @@ -119,13 +120,12 @@ public void testSinkTaskStartWithRecovery() throws Exception { fs.createNewFile(new Path(file)); } } - createWALs(tempfiles, committedFiles); - Map props = createProps(); + createWALs(tempfiles, committedFiles); HdfsSinkTask task = new HdfsSinkTask(); task.initialize(context); - task.start(props); + task.start(properties); Map offsets = context.offsets(); assertEquals(2, offsets.size()); @@ -139,14 +139,14 @@ public void testSinkTaskStartWithRecovery() throws Exception { @Test public void testSinkTaskPut() throws Exception { - Map props = createProps(); + setUp(); HdfsSinkTask task = new HdfsSinkTask(); String key = "key"; Schema schema = createSchema(); Struct record = createRecord(schema); Collection sinkRecords = new ArrayList<>(); - for (TopicPartition tp: assignment) { + for (TopicPartition tp : context.assignment()) { for (long offset = 0; offset < 7; offset++) { SinkRecord sinkRecord = new SinkRecord(tp.topic(), tp.partition(), Schema.STRING_SCHEMA, key, schema, record, offset); @@ -154,7 +154,7 @@ public void testSinkTaskPut() throws Exception { } } task.initialize(context); - task.start(props); + task.start(properties); task.put(sinkRecords); task.stop(); @@ -162,7 +162,7 @@ public void testSinkTaskPut() throws Exception { // Last file (offset 6) doesn't satisfy size requirement and gets discarded on close long[] validOffsets = {-1, 2, 5}; - for (TopicPartition tp : assignment) { + for (TopicPartition tp : context.assignment()) { String directory = tp.topic() + "/" + "partition=" + String.valueOf(tp.partition()); for (int j = 1; j < validOffsets.length; ++j) { long startOffset = validOffsets[j - 1] + 1; @@ -170,7 +170,7 @@ public void testSinkTaskPut() throws Exception { Path path = new Path(FileUtils.committedFileName(url, topicsDir, directory, tp, startOffset, endOffset, extension, ZERO_PAD_FMT)); - Collection records = schemaFileReader.readData(conf, path); + Collection records = schemaFileReader.readData(connectorConfig.getHadoopConfiguration(), path); long size = endOffset - startOffset + 1; assertEquals(records.size(), size); for (Object avroRecord : records) { @@ -182,14 +182,14 @@ public void testSinkTaskPut() throws Exception { @Test public void testSinkTaskPutPrimitive() throws Exception { - Map props = createProps(); + setUp(); HdfsSinkTask task = new HdfsSinkTask(); final String key = "key"; final Schema schema = Schema.INT32_SCHEMA; final int record = 12; Collection sinkRecords = new ArrayList<>(); - for (TopicPartition tp: assignment) { + for (TopicPartition tp : context.assignment()) { for (long offset = 0; offset < 7; offset++) { SinkRecord sinkRecord = new SinkRecord(tp.topic(), tp.partition(), Schema.STRING_SCHEMA, key, schema, record, offset); @@ -197,15 +197,14 @@ final int record = 12; } } task.initialize(context); - task.start(props); + task.start(properties); task.put(sinkRecords); task.stop(); - AvroData avroData = task.getAvroData(); // Last file (offset 6) doesn't satisfy size requirement and gets discarded on close long[] validOffsets = {-1, 2, 5}; - for (TopicPartition tp : assignment) { + for (TopicPartition tp : context.assignment()) { String directory = tp.topic() + "/" + "partition=" + String.valueOf(tp.partition()); for (int j 
= 1; j < validOffsets.length; ++j) { long startOffset = validOffsets[j - 1] + 1; @@ -213,7 +212,7 @@ final int record = 12; Path path = new Path(FileUtils.committedFileName(url, topicsDir, directory, tp, startOffset, endOffset, extension, ZERO_PAD_FMT)); - Collection records = schemaFileReader.readData(conf, path); + Collection records = schemaFileReader.readData(connectorConfig.getHadoopConfiguration(), path); long size = endOffset - startOffset + 1; assertEquals(records.size(), size); for (Object avroRecord : records) { @@ -241,9 +240,14 @@ private void createCommittedFiles() throws IOException { private void createWALs(Map> tempfiles, Map> committedFiles) throws Exception { @SuppressWarnings("unchecked") - Class storageClass = (Class) - Class.forName(connectorConfig.getString(HdfsSinkConnectorConfig.STORAGE_CLASS_CONFIG)); - Storage storage = StorageFactory.createStorage(storageClass, conf, url); + Class storageClass = (Class) + connectorConfig.getClass(StorageCommonConfig.STORAGE_CLASS_CONFIG); + HdfsStorage storage = StorageFactory.createStorage( + storageClass, + HdfsSinkConnectorConfig.class, + connectorConfig, + url + ); for (TopicPartition tp: tempfiles.keySet()) { WAL wal = storage.wal(logsDir, tp); diff --git a/src/test/java/io/confluent/connect/hdfs/HdfsSinkTaskTestWithSecureHDFS.java b/src/test/java/io/confluent/connect/hdfs/HdfsSinkTaskTestWithSecureHDFS.java index dbc6ab679..7d18e88e2 100644 --- a/src/test/java/io/confluent/connect/hdfs/HdfsSinkTaskTestWithSecureHDFS.java +++ b/src/test/java/io/confluent/connect/hdfs/HdfsSinkTaskTestWithSecureHDFS.java @@ -26,6 +26,7 @@ import java.util.Map; import io.confluent.connect.avro.AvroData; +import io.confluent.connect.hdfs.avro.AvroDataFileReader; import io.confluent.connect.hdfs.avro.AvroFileReader; import static org.junit.Assert.assertEquals; @@ -34,18 +35,18 @@ public class HdfsSinkTaskTestWithSecureHDFS extends TestWithSecureMiniDFSCluster private static final String extension = ".avro"; private static final String ZERO_PAD_FMT = "%010d"; - private final SchemaFileReader schemaFileReader = new AvroFileReader(avroData); + private final DataFileReader schemaFileReader = new AvroDataFileReader(); @Test public void testSinkTaskPut() throws Exception { - Map props = createProps(); + setUp(); HdfsSinkTask task = new HdfsSinkTask(); String key = "key"; Schema schema = createSchema(); Struct record = createRecord(schema); Collection sinkRecords = new ArrayList<>(); - for (TopicPartition tp : assignment) { + for (TopicPartition tp : context.assignment()) { for (long offset = 0; offset < 7; offset++) { SinkRecord sinkRecord = new SinkRecord(tp.topic(), tp.partition(), Schema.STRING_SCHEMA, key, schema, record, @@ -54,7 +55,7 @@ public void testSinkTaskPut() throws Exception { } } task.initialize(context); - task.start(props); + task.start(properties); task.put(sinkRecords); task.stop(); @@ -62,7 +63,7 @@ public void testSinkTaskPut() throws Exception { // Last file (offset 6) doesn't satisfy size requirement and gets discarded on close long[] validOffsets = {-1, 2, 5}; - for (TopicPartition tp : assignment) { + for (TopicPartition tp : context.assignment()) { String directory = tp.topic() + "/" + "partition=" + String.valueOf(tp.partition()); for (int j = 1; j < validOffsets.length; ++j) { long startOffset = validOffsets[j - 1] + 1; @@ -70,7 +71,7 @@ public void testSinkTaskPut() throws Exception { Path path = new Path(FileUtils.committedFileName(url, topicsDir, directory, tp, startOffset, endOffset, extension, ZERO_PAD_FMT)); - 
Collection records = schemaFileReader.readData(conf, path); + Collection records = schemaFileReader.readData(connectorConfig.getHadoopConfiguration(), path); long size = endOffset - startOffset + 1; assertEquals(records.size(), size); for (Object avroRecord : records) { diff --git a/src/test/java/io/confluent/connect/hdfs/OldFormat.java b/src/test/java/io/confluent/connect/hdfs/OldFormat.java new file mode 100644 index 000000000..962e5d44a --- /dev/null +++ b/src/test/java/io/confluent/connect/hdfs/OldFormat.java @@ -0,0 +1,86 @@ +package io.confluent.connect.hdfs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.sink.SinkRecord; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; + +import io.confluent.connect.avro.AvroData; +import io.confluent.connect.hdfs.hive.HiveMetaStore; +import io.confluent.connect.hdfs.hive.HiveUtil; +import io.confluent.connect.hdfs.partitioner.Partitioner; + +// Be careful if you need to edit anything in here. The Format, RecordWriterProvider, +// RecordWriter, SchemaFileReader, and HiveUtil classes need to remain compatible. +public class OldFormat implements Format { + + // DO NOT add any other constructors. No-arg constructor must be accessible + public OldFormat() { + + } + + @Override + public RecordWriterProvider getRecordWriterProvider() { + return new RecordWriterProvider() { + @Override + public String getExtension() { + return ".fake"; + } + + @Override + public RecordWriter getRecordWriter( + Configuration conf, + String fileName, + SinkRecord record, + AvroData avroData + ) throws IOException { + return new RecordWriter() { + @Override + public void write(SinkRecord value) throws IOException { + // Intentionally empty + } + + @Override + public void close() throws IOException { + // Intentionally empty + } + }; + } + }; + } + + @Override + public SchemaFileReader getSchemaFileReader(AvroData avroData) { + return new SchemaFileReader() { + @Override + public Schema getSchema(Configuration conf, Path path) throws IOException { + return Schema.INT32_SCHEMA; + } + + @Override + public Collection readData(Configuration conf, Path path) throws IOException { + return Arrays.asList((Object) 1, 2, 3); + } + }; + } + + @Override + public HiveUtil getHiveUtil(HdfsSinkConnectorConfig config, HiveMetaStore hiveMetaStore) { + return new HiveUtil(config, hiveMetaStore) { + @Override + public void createTable(String database, String tableName, Schema schema, + Partitioner partitioner) { + // Intentionally empty + } + + @Override + public void alterSchema(String s, String s1, Schema schema) { + // Intentionally empty + } + }; + } +} diff --git a/src/test/java/io/confluent/connect/hdfs/TestWithMiniDFSCluster.java b/src/test/java/io/confluent/connect/hdfs/TestWithMiniDFSCluster.java index ffa26fb1a..d8996d34b 100644 --- a/src/test/java/io/confluent/connect/hdfs/TestWithMiniDFSCluster.java +++ b/src/test/java/io/confluent/connect/hdfs/TestWithMiniDFSCluster.java @@ -28,19 +28,20 @@ import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.sink.SinkRecord; import org.junit.After; -import org.junit.Before; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import 
io.confluent.connect.hdfs.filter.TopicPartitionCommittedFileFilter; import io.confluent.connect.hdfs.partitioner.Partitioner; +import io.confluent.connect.storage.common.StorageCommonConfig; import static org.hamcrest.CoreMatchers.is; import static org.junit.Assert.assertEquals; @@ -50,22 +51,31 @@ public class TestWithMiniDFSCluster extends HdfsSinkConnectorTestBase { protected MiniDFSCluster cluster; protected FileSystem fs; - protected SchemaFileReader schemaFileReader; + protected DataFileReader dataFileReader; protected Partitioner partitioner; protected String extension; // The default based on default configuration of 10 protected String zeroPadFormat = "%010d"; + private Map localProps = new HashMap<>(); - @Before + @Override + protected Map createProps() { + Map props = super.createProps(); + url = "hdfs://" + cluster.getNameNode().getClientNamenodeAddress(); + // Override configs using url here + localProps.put(HdfsSinkConnectorConfig.HDFS_URL_CONFIG, url); + localProps.put(StorageCommonConfig.STORE_URL_CONFIG, url); + props.putAll(localProps); + return props; + } + + //@Before public void setUp() throws Exception { - super.setUp(); - conf = new Configuration(); - cluster = createDFSCluster(conf); + Configuration localConf = new Configuration(); + cluster = createDFSCluster(localConf); cluster.waitActive(); - url = "hdfs://" + cluster.getNameNode().getClientNamenodeAddress(); fs = cluster.getFileSystem(); - Map props = createProps(); - connectorConfig = new HdfsSinkConnectorConfig(props); + super.setUp(); } @After @@ -89,13 +99,6 @@ private MiniDFSCluster createDFSCluster(Configuration conf) throws IOException { return cluster; } - @Override - protected Map createProps() { - Map props = super.createProps(); - props.put(HdfsSinkConnectorConfig.HDFS_URL_CONFIG, url); - return props; - } - /** * Return a list of new records starting at zero offset. 
* @@ -266,7 +269,7 @@ protected void verify(List sinkRecords, long[] validOffsets, Set records = schemaFileReader.readData(conf, path); + Collection records = dataFileReader.readData(connectorConfig.getHadoopConfiguration(), path); long size = endOffset - startOffset + 1; assertEquals(size, records.size()); diff --git a/src/test/java/io/confluent/connect/hdfs/avro/AvroDataFileReader.java b/src/test/java/io/confluent/connect/hdfs/avro/AvroDataFileReader.java new file mode 100644 index 000000000..0af444098 --- /dev/null +++ b/src/test/java/io/confluent/connect/hdfs/avro/AvroDataFileReader.java @@ -0,0 +1,30 @@ +package io.confluent.connect.hdfs.avro; + +import org.apache.avro.file.FileReader; +import org.apache.avro.file.SeekableInput; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.io.DatumReader; +import org.apache.avro.mapred.FsInput; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; + +import io.confluent.connect.hdfs.DataFileReader; + +public class AvroDataFileReader implements DataFileReader { + @Override + public Collection readData(Configuration conf, Path path) throws IOException { + ArrayList collection = new ArrayList<>(); + SeekableInput input = new FsInput(path, conf); + DatumReader reader = new GenericDatumReader<>(); + FileReader fileReader = org.apache.avro.file.DataFileReader.openReader(input, reader); + for (Object object: fileReader) { + collection.add(object); + } + fileReader.close(); + return collection; + } +} diff --git a/src/test/java/io/confluent/connect/hdfs/avro/AvroHiveUtilTest.java b/src/test/java/io/confluent/connect/hdfs/avro/AvroHiveUtilTest.java index 74322e188..af1b4275c 100644 --- a/src/test/java/io/confluent/connect/hdfs/avro/AvroHiveUtilTest.java +++ b/src/test/java/io/confluent/connect/hdfs/avro/AvroHiveUtilTest.java @@ -22,11 +22,12 @@ import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.sink.SinkRecord; import org.apache.kafka.connect.sink.SinkTaskContext; -import org.junit.Before; import org.junit.Test; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import io.confluent.connect.avro.AvroData; import io.confluent.connect.hdfs.DataWriter; @@ -38,10 +39,17 @@ import static org.junit.Assert.assertEquals; public class AvroHiveUtilTest extends HiveTestBase { - private HiveUtil hive; + private Map localProps = new HashMap<>(); + + @Override + protected Map createProps() { + Map props = super.createProps(); + props.putAll(localProps); + return props; + } - @Before + //@Before should be omitted in order to be able to add properties per test. 
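The commented-out @Before above is part of a pattern applied across these test classes: defaults come from createProps(), per-test overrides are staged in a localProps map, and each test calls setUp() itself once its overrides are in place. A condensed sketch of the pattern (the class and test names here are illustrative, not part of the patch):

    import java.util.HashMap;
    import java.util.Map;

    import org.junit.Test;

    import io.confluent.connect.storage.hive.HiveConfig;

    public class ExampleHdfsSinkTest extends HiveTestBase {
      private Map<String, String> localProps = new HashMap<>();

      @Override
      protected Map<String, String> createProps() {
        Map<String, String> props = super.createProps();
        props.putAll(localProps);   // per-test overrides win over the defaults
        return props;
      }

      // No @Before here: tests add to localProps first, then call setUp() themselves.
      public void setUp() throws Exception {
        super.setUp();
      }

      @Test
      public void testWithHiveEnabled() throws Exception {
        localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true");  // example override
        setUp();                    // connectorConfig is built with the override applied
        // ... exercise the writer as in the tests below ...
      }
    }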
public void setUp() throws Exception { super.setUp(); hive = new AvroHiveUtil(connectorConfig, avroData, hiveMetaStore); @@ -49,8 +57,9 @@ public void setUp() throws Exception { @Test public void testCreateTable() throws Exception { + setUp(); prepareData(TOPIC, PARTITION); - Partitioner partitioner = HiveTestUtils.getPartitioner(); + Partitioner partitioner = HiveTestUtils.getPartitioner(parsedConfig); Schema schema = createSchema(); hive.createTable(hiveDatabase, TOPIC, schema, partitioner); @@ -76,7 +85,10 @@ public void testCreateTable() throws Exception { assertEquals(1, partitionCols.size()); assertEquals("partition", partitionCols.get(0).getName()); - String result = HiveTestUtils.runHive(hiveExec, "SELECT * FROM " + TOPIC); + String result = HiveTestUtils.runHive( + hiveExec, + "SELECT * FROM " + hiveMetaStore.tableNameConverter(TOPIC) + ); String[] rows = result.split("\n"); // Only 6 of the 7 records should have been delivered due to flush_size = 3 assertEquals(6, rows.length); @@ -91,8 +103,9 @@ public void testCreateTable() throws Exception { @Test public void testAlterSchema() throws Exception { + setUp(); prepareData(TOPIC, PARTITION); - Partitioner partitioner = HiveTestUtils.getPartitioner(); + Partitioner partitioner = HiveTestUtils.getPartitioner(parsedConfig); Schema schema = createSchema(); hive.createTable(hiveDatabase, TOPIC, schema, partitioner); @@ -119,7 +132,10 @@ public void testAlterSchema() throws Exception { hive.alterSchema(hiveDatabase, TOPIC, newSchema); - String result = HiveTestUtils.runHive(hiveExec, "SELECT * from " + TOPIC); + String result = HiveTestUtils.runHive( + hiveExec, + "SELECT * from " + hiveMetaStore.tableNameConverter(TOPIC) + ); String[] rows = result.split("\n"); // Only 6 of the 7 records should have been delivered due to flush_size = 3 assertEquals(6, rows.length); @@ -140,7 +156,7 @@ private void prepareData(String topic, int partition) throws Exception { List sinkRecords = createSinkRecords(7); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); } diff --git a/src/test/java/io/confluent/connect/hdfs/avro/DataWriterAvroTest.java b/src/test/java/io/confluent/connect/hdfs/avro/DataWriterAvroTest.java index ca80a7a8f..ceaf4b37c 100644 --- a/src/test/java/io/confluent/connect/hdfs/avro/DataWriterAvroTest.java +++ b/src/test/java/io/confluent/connect/hdfs/avro/DataWriterAvroTest.java @@ -31,9 +31,9 @@ import io.confluent.connect.hdfs.FileUtils; import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.TestWithMiniDFSCluster; -import io.confluent.connect.hdfs.storage.Storage; -import io.confluent.connect.hdfs.storage.StorageFactory; +import io.confluent.connect.hdfs.storage.HdfsStorage; import io.confluent.connect.hdfs.wal.WAL; +import io.confluent.connect.storage.hive.HiveConfig; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -45,7 +45,7 @@ public class DataWriterAvroTest extends TestWithMiniDFSCluster { @Before public void setUp() throws Exception { super.setUp(); - schemaFileReader = new AvroFileReader(avroData); + dataFileReader = new AvroDataFileReader(); extension = ".avro"; } @@ -58,7 +58,7 @@ public void testWriteRecord() throws Exception { List sinkRecords = createSinkRecords(7); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); // Last file (offset 6) doesn't satisfy size requirement and gets discarded on close @@ -70,10 +70,7 @@ public void 
testWriteRecord() throws Exception { public void testRecovery() throws Exception { fs.delete(new Path(FileUtils.directoryName(url, topicsDir, TOPIC_PARTITION)), true); - @SuppressWarnings("unchecked") - Class storageClass = (Class) - Class.forName(connectorConfig.getString(HdfsSinkConnectorConfig.STORAGE_CLASS_CONFIG)); - Storage storage = StorageFactory.createStorage(storageClass, conf, url); + HdfsStorage storage = new HdfsStorage(connectorConfig, url); DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); partitioner = hdfsWriter.getPartitioner(); @@ -101,7 +98,7 @@ public void testRecovery() throws Exception { List sinkRecords = createSinkRecords(3, 50); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); long[] validOffsets = {0, 10, 20, 30, 40, 50, 53}; @@ -113,19 +110,19 @@ public void testWriteRecordMultiplePartitions() throws Exception { DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); partitioner = hdfsWriter.getPartitioner(); - for (TopicPartition tp: assignment) { + for (TopicPartition tp : context.assignment()) { hdfsWriter.recover(tp); } - List sinkRecords = createSinkRecords(7, 0, assignment); + List sinkRecords = createSinkRecords(7, 0, context.assignment()); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); // Last file (offset 6) doesn't satisfy size requirement and gets discarded on close long[] validOffsets = {0, 3, 6}; - verify(sinkRecords, validOffsets, assignment); + verify(sinkRecords, validOffsets, context.assignment()); } @Test @@ -133,18 +130,19 @@ public void testWriteInterleavedRecordsInMultiplePartitions() throws Exception { DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); partitioner = hdfsWriter.getPartitioner(); - for (TopicPartition tp: assignment) { + for (TopicPartition tp : context.assignment()) { hdfsWriter.recover(tp); } - List sinkRecords = createSinkRecordsInterleaved(7 * assignment.size(), 0, assignment); + List sinkRecords = createSinkRecordsInterleaved(7 * context.assignment().size(), 0, + context.assignment()); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); long[] validOffsets = {0, 3, 6}; - verify(sinkRecords, validOffsets, assignment); + verify(sinkRecords, validOffsets, context.assignment()); } @Test @@ -152,14 +150,15 @@ public void testWriteInterleavedRecordsInMultiplePartitionsNonZeroInitialOffset( DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); partitioner = hdfsWriter.getPartitioner(); - List sinkRecords = createSinkRecordsInterleaved(7 * assignment.size(), 9, assignment); + List sinkRecords = createSinkRecordsInterleaved(7 * context.assignment().size(), 9, + context.assignment()); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); long[] validOffsets = {9, 12, 15}; - verify(sinkRecords, validOffsets, assignment); + verify(sinkRecords, validOffsets, context.assignment()); } @Test @@ -188,7 +187,7 @@ public void testGetPreviousOffsets() throws Exception { long previousOffset = committedOffsets.get(TOPIC_PARTITION); assertEquals(previousOffset, 6L); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); } @@ -201,7 +200,7 @@ public void testWriteRecordNonZeroInitialOffset() throws Exception { List sinkRecords = createSinkRecords(7, 3); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + 
hdfsWriter.close(); hdfsWriter.stop(); // Last file (offset 9) doesn't satisfy size requirement and gets discarded on close @@ -214,23 +213,23 @@ public void testRebalance() throws Exception { DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); partitioner = hdfsWriter.getPartitioner(); - // Initial assignment is {TP1, TP2} - for (TopicPartition tp: assignment) { + Set originalAssignment = new HashSet<>(context.assignment()); + // Starts with TOPIC_PARTITION and TOPIC_PARTITION2 + for (TopicPartition tp : originalAssignment) { hdfsWriter.recover(tp); } - List sinkRecords = createSinkRecords(7, 0, assignment); - hdfsWriter.write(sinkRecords); - - Set oldAssignment = new HashSet<>(assignment); + Set nextAssignment = new HashSet<>(); + nextAssignment.add(TOPIC_PARTITION); + nextAssignment.add(TOPIC_PARTITION3); - Set newAssignment = new HashSet<>(); - newAssignment.add(TOPIC_PARTITION); - newAssignment.add(TOPIC_PARTITION3); + List sinkRecords = createSinkRecords(7, 0, originalAssignment); - hdfsWriter.close(assignment); - assignment = newAssignment; - hdfsWriter.open(newAssignment); + hdfsWriter.write(sinkRecords); + hdfsWriter.close(); + // Set the new assignment to the context + context.setAssignment(nextAssignment); + hdfsWriter.open(nextAssignment); assertEquals(null, hdfsWriter.getBucketWriter(TOPIC_PARTITION2)); assertNotNull(hdfsWriter.getBucketWriter(TOPIC_PARTITION)); @@ -241,10 +240,10 @@ public void testRebalance() throws Exception { verify(sinkRecords, validOffsetsTopicPartition2, Collections.singleton(TOPIC_PARTITION2), true); // Message offsets start at 6 because we discarded the in-progress temp file on re-balance - sinkRecords = createSinkRecords(3, 6, assignment); + sinkRecords = createSinkRecords(3, 6, context.assignment()); hdfsWriter.write(sinkRecords); - hdfsWriter.close(newAssignment); + hdfsWriter.close(); hdfsWriter.stop(); // Last file (offset 9) doesn't satisfy size requirement and gets discarded on close @@ -253,15 +252,13 @@ public void testRebalance() throws Exception { long[] validOffsetsTopicPartition3 = {6, 9}; verify(sinkRecords, validOffsetsTopicPartition3, Collections.singleton(TOPIC_PARTITION3), true); - - assignment = oldAssignment; } @Test public void testProjectBackWard() throws Exception { Map props = createProps(); props.put(HdfsSinkConnectorConfig.FLUSH_SIZE_CONFIG, "2"); - props.put(HdfsSinkConnectorConfig.SCHEMA_COMPATIBILITY_CONFIG, "BACKWARD"); + props.put(HiveConfig.SCHEMA_COMPATIBILITY_CONFIG, "BACKWARD"); HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props); DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); @@ -271,7 +268,7 @@ public void testProjectBackWard() throws Exception { List sinkRecords = createSinkRecordsWithAlternatingSchemas(7, 0); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); long[] validOffsets = {0, 1, 3, 5, 7}; verify(sinkRecords, validOffsets); @@ -290,7 +287,7 @@ public void testProjectNone() throws Exception { List sinkRecords = createSinkRecordsWithAlternatingSchemas(7, 0); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); long[] validOffsets = {0, 1, 2, 3, 4, 5, 6}; @@ -301,7 +298,7 @@ public void testProjectNone() throws Exception { public void testProjectForward() throws Exception { Map props = createProps(); props.put(HdfsSinkConnectorConfig.FLUSH_SIZE_CONFIG, "2"); - props.put(HdfsSinkConnectorConfig.SCHEMA_COMPATIBILITY_CONFIG, 
"FORWARD"); + props.put(HiveConfig.SCHEMA_COMPATIBILITY_CONFIG, "FORWARD"); HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props); DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); @@ -312,7 +309,7 @@ public void testProjectForward() throws Exception { List sinkRecords = createSinkRecordsWithAlternatingSchemas(8, 0).subList(1, 8); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); long[] validOffsets = {1, 2, 4, 6, 8}; @@ -322,7 +319,7 @@ public void testProjectForward() throws Exception { @Test public void testProjectNoVersion() throws Exception { Map props = createProps(); - props.put(HdfsSinkConnectorConfig.SCHEMA_COMPATIBILITY_CONFIG, "BACKWARD"); + props.put(HiveConfig.SCHEMA_COMPATIBILITY_CONFIG, "BACKWARD"); HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props); DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); @@ -338,7 +335,7 @@ public void testProjectNoVersion() throws Exception { } catch (RuntimeException e) { // expected } finally { - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); long[] validOffsets = {}; verify(Collections.emptyList(), validOffsets); @@ -359,8 +356,7 @@ public void testFlushPartialFile() throws Exception { props.put(HdfsSinkConnectorConfig.FLUSH_SIZE_CONFIG, FLUSH_SIZE_CONFIG); props.put(HdfsSinkConnectorConfig.ROTATE_INTERVAL_MS_CONFIG, ROTATE_INTERVAL_MS_CONFIG); HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props); - assignment = new HashSet<>(); - assignment.add(TOPIC_PARTITION); + context.assignment().add(TOPIC_PARTITION); DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); partitioner = hdfsWriter.getPartitioner(); @@ -382,7 +378,7 @@ public void testFlushPartialFile() throws Exception { long previousOffset = committedOffsets.get(TOPIC_PARTITION); assertEquals(NUMBER_OF_RECORDS, previousOffset); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); } diff --git a/src/test/java/io/confluent/connect/hdfs/avro/HiveIntegrationAvroTest.java b/src/test/java/io/confluent/connect/hdfs/avro/HiveIntegrationAvroTest.java index bd6a9832b..2ba50294f 100644 --- a/src/test/java/io/confluent/connect/hdfs/avro/HiveIntegrationAvroTest.java +++ b/src/test/java/io/confluent/connect/hdfs/avro/HiveIntegrationAvroTest.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; @@ -38,20 +39,30 @@ import io.confluent.connect.hdfs.partitioner.DailyPartitioner; import io.confluent.connect.hdfs.partitioner.FieldPartitioner; import io.confluent.connect.hdfs.partitioner.TimeUtils; +import io.confluent.connect.storage.hive.HiveConfig; +import io.confluent.connect.storage.partitioner.PartitionerConfig; import static org.junit.Assert.assertEquals; public class HiveIntegrationAvroTest extends HiveTestBase { + private Map localProps = new HashMap<>(); @Override protected Map createProps() { Map props = super.createProps(); props.put(HdfsSinkConnectorConfig.SHUTDOWN_TIMEOUT_CONFIG, "10000"); + props.putAll(localProps); return props; } + //@Before should be omitted in order to be able to add properties per test. 
+ public void setUp() throws Exception { + super.setUp(); + } + @Test public void testSyncWithHiveAvro() throws Exception { + setUp(); DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); hdfsWriter.recover(TOPIC_PARTITION); @@ -67,11 +78,11 @@ public void testSyncWithHiveAvro() throws Exception { } hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); Map props = createProps(); - props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true"); + props.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true"); HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props); hdfsWriter = new DataWriter(config, context, avroData); @@ -97,17 +108,15 @@ public void testSyncWithHiveAvro() throws Exception { assertEquals(expectedPartitions, partitions); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); } @Test public void testHiveIntegrationAvro() throws Exception { - Map props = createProps(); - props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true"); - HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props); - - DataWriter hdfsWriter = new DataWriter(config, context, avroData); + localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true"); + setUp(); + DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); hdfsWriter.recover(TOPIC_PARTITION); String key = "key"; @@ -123,7 +132,7 @@ public void testHiveIntegrationAvro() throws Exception { } hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); Table table = hiveMetaStore.getTable(hiveDatabase, TOPIC); @@ -149,13 +158,11 @@ public void testHiveIntegrationAvro() throws Exception { @Test public void testHiveIntegrationTopicWithDotsAvro() throws Exception { - assignment.add(TOPIC_WITH_DOTS_PARTITION); + localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true"); + setUp(); + context.assignment().add(TOPIC_WITH_DOTS_PARTITION); - Map props = createProps(); - props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true"); - HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props); - - DataWriter hdfsWriter = new DataWriter(config, context, avroData); + DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); hdfsWriter.recover(TOPIC_WITH_DOTS_PARTITION); String key = "key"; @@ -171,7 +178,7 @@ public void testHiveIntegrationTopicWithDotsAvro() throws Exception { } hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); Table table = hiveMetaStore.getTable(hiveDatabase, TOPIC_WITH_DOTS); @@ -197,13 +204,12 @@ public void testHiveIntegrationTopicWithDotsAvro() throws Exception { @Test public void testHiveIntegrationFieldPartitionerAvro() throws Exception { - Map props = createProps(); - props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true"); - props.put(HdfsSinkConnectorConfig.PARTITIONER_CLASS_CONFIG, FieldPartitioner.class.getName()); - props.put(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG, "int"); + localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true"); + localProps.put(PartitionerConfig.PARTITIONER_CLASS_CONFIG, FieldPartitioner.class.getName()); + localProps.put(PartitionerConfig.PARTITION_FIELD_NAME_CONFIG, "int"); + setUp(); - HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props); - DataWriter hdfsWriter = new DataWriter(config, context, avroData); + DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); String key = "key"; 
Schema schema = createSchema(); @@ -221,7 +227,7 @@ public void testHiveIntegrationFieldPartitionerAvro() throws Exception { } hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); Table table = hiveMetaStore.getTable(hiveDatabase, TOPIC); @@ -238,7 +244,9 @@ public void testHiveIntegrationFieldPartitionerAvro() throws Exception { assertEquals(expectedColumnNames, actualColumnNames); - String partitionFieldName = config.getString(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG); + String partitionFieldName = connectorConfig.getString( + PartitionerConfig.PARTITION_FIELD_NAME_CONFIG + ); String directory1 = TOPIC + "/" + partitionFieldName + "=" + String.valueOf(16); String directory2 = TOPIC + "/" + partitionFieldName + "=" + String.valueOf(17); String directory3 = TOPIC + "/" + partitionFieldName + "=" + String.valueOf(18); @@ -259,7 +267,10 @@ public void testHiveIntegrationFieldPartitionerAvro() throws Exception { expectedResult.add(part); } } - String result = HiveTestUtils.runHive(hiveExec, "SELECT * FROM " + TOPIC); + String result = HiveTestUtils.runHive( + hiveExec, + "SELECT * FROM " + hiveMetaStore.tableNameConverter(TOPIC) + ); String[] rows = result.split("\n"); assertEquals(9, rows.length); for (int i = 0; i < rows.length; ++i) { @@ -272,14 +283,10 @@ public void testHiveIntegrationFieldPartitionerAvro() throws Exception { @Test public void testHiveIntegrationTimeBasedPartitionerAvro() throws Exception { - Map props = createProps(); - props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true"); - props.put(HdfsSinkConnectorConfig.PARTITIONER_CLASS_CONFIG, DailyPartitioner.class.getName()); - props.put(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, "America/Los_Angeles"); - props.put(HdfsSinkConnectorConfig.LOCALE_CONFIG, "en"); - - HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props); - DataWriter hdfsWriter = new DataWriter(config, context, avroData); + localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true"); + localProps.put(PartitionerConfig.PARTITIONER_CLASS_CONFIG, DailyPartitioner.class.getName()); + setUp(); + DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData); String key = "key"; Schema schema = createSchema(); @@ -289,15 +296,22 @@ public void testHiveIntegrationTimeBasedPartitionerAvro() throws Exception { long offset = 0; for (Struct record : records) { for (long count = 0; count < 3; count++) { - SinkRecord sinkRecord = new SinkRecord(TOPIC, PARTITION, Schema.STRING_SCHEMA, key, schema, record, - offset + count); + SinkRecord sinkRecord = new SinkRecord( + TOPIC, + PARTITION, + Schema.STRING_SCHEMA, + key, + schema, + record, + offset + count + ); sinkRecords.add(sinkRecord); } offset = offset + 3; } hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); Table table = hiveMetaStore.getTable(hiveDatabase, TOPIC); @@ -339,7 +353,10 @@ public void testHiveIntegrationTimeBasedPartitionerAvro() throws Exception { } } - String result = HiveTestUtils.runHive(hiveExec, "SELECT * FROM " + TOPIC); + String result = HiveTestUtils.runHive( + hiveExec, + "SELECT * FROM " + hiveMetaStore.tableNameConverter(TOPIC) + ); String[] rows = result.split("\n"); assertEquals(9, rows.length); for (int i = 0; i < rows.length; ++i) { diff --git a/src/test/java/io/confluent/connect/hdfs/avro/TopicPartitionWriterTest.java b/src/test/java/io/confluent/connect/hdfs/avro/TopicPartitionWriterTest.java index 8cf84ea51..88d58f6b7 100644 --- 
a/src/test/java/io/confluent/connect/hdfs/avro/TopicPartitionWriterTest.java +++ b/src/test/java/io/confluent/connect/hdfs/avro/TopicPartitionWriterTest.java @@ -19,19 +19,16 @@ import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.sink.SinkRecord; -import org.junit.Before; import org.junit.Test; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.concurrent.TimeUnit; import io.confluent.connect.hdfs.FileUtils; import io.confluent.connect.hdfs.Format; @@ -45,41 +42,73 @@ import io.confluent.connect.hdfs.partitioner.Partitioner; import io.confluent.connect.hdfs.partitioner.TimeBasedPartitioner; import io.confluent.connect.hdfs.partitioner.TimeUtils; -import io.confluent.connect.hdfs.storage.Storage; -import io.confluent.connect.hdfs.storage.StorageFactory; +import io.confluent.connect.hdfs.storage.HdfsStorage; +import io.confluent.connect.storage.StorageFactory; +import io.confluent.connect.storage.common.StorageCommonConfig; +import io.confluent.connect.storage.hive.schema.TimeBasedSchemaGenerator; +import io.confluent.connect.storage.partitioner.PartitionerConfig; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class TopicPartitionWriterTest extends TestWithMiniDFSCluster { - private RecordWriterProvider writerProvider; - private Storage storage; + private RecordWriterProvider writerProvider = null; + private io.confluent.connect.storage.format.RecordWriterProvider + newWriterProvider; + private HdfsStorage storage; + private Map localProps = new HashMap<>(); + + @Override + protected Map createProps() { + Map props = super.createProps(); + props.putAll(localProps); + return props; + } - @Before + //@Before should be omitted in order to be able to add properties per test. 
public void setUp() throws Exception { super.setUp(); @SuppressWarnings("unchecked") - Format format = ((Class) Class.forName(connectorConfig.getString(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG))).newInstance(); - writerProvider = format.getRecordWriterProvider(); - schemaFileReader = format.getSchemaFileReader(avroData); - extension = writerProvider.getExtension(); + Class storageClass = (Class) + connectorConfig.getClass(StorageCommonConfig.STORAGE_CLASS_CONFIG); + storage = StorageFactory.createStorage( + storageClass, + HdfsSinkConnectorConfig.class, + connectorConfig, + url + ); @SuppressWarnings("unchecked") - Class storageClass = (Class) Class - .forName(connectorConfig.getString(HdfsSinkConnectorConfig.STORAGE_CLASS_CONFIG)); - storage = StorageFactory.createStorage(storageClass, conf, url); + Class formatClass + = (Class) connectorConfig.getClass( + HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG + ); + io.confluent.connect.storage.format.Format format + = formatClass.getConstructor(HdfsStorage.class).newInstance(storage); + writerProvider = null; + newWriterProvider = format.getRecordWriterProvider(); + dataFileReader = new AvroDataFileReader(); + extension = newWriterProvider.getExtension(); createTopicDir(url, topicsDir, TOPIC); createLogsDir(url, logsDir); } @Test public void testWriteRecordDefaultWithPadding() throws Exception { + localProps.put(HdfsSinkConnectorConfig.FILENAME_OFFSET_ZERO_PAD_WIDTH_CONFIG, "2"); + setUp(); Partitioner partitioner = new DefaultPartitioner(); - partitioner.configure(Collections.emptyMap()); - connectorProps.put(HdfsSinkConnectorConfig.FILENAME_OFFSET_ZERO_PAD_WIDTH_CONFIG, "2"); - configureConnector(); + partitioner.configure(parsedConfig); TopicPartitionWriter topicPartitionWriter = new TopicPartitionWriter( - TOPIC_PARTITION, storage, writerProvider, partitioner, connectorConfig, context, avroData); + TOPIC_PARTITION, + storage, + writerProvider, + newWriterProvider, + partitioner, + connectorConfig, + context, + avroData + ); Schema schema = createSchema(); List records = createRecordBatches(schema, 3, 3); @@ -106,17 +135,26 @@ public void testWriteRecordDefaultWithPadding() throws Exception { verify(expectedFiles, expectedBatchSize, records, schema); } - @Test public void testWriteRecordFieldPartitioner() throws Exception { - Map config = createConfig(); + setUp(); Partitioner partitioner = new FieldPartitioner(); - partitioner.configure(config); + partitioner.configure(parsedConfig); - String partitionField = (String) config.get(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG); + String partitionField = (String) parsedConfig.get( + PartitionerConfig.PARTITION_FIELD_NAME_CONFIG + ); TopicPartitionWriter topicPartitionWriter = new TopicPartitionWriter( - TOPIC_PARTITION, storage, writerProvider, partitioner, connectorConfig, context, avroData); + TOPIC_PARTITION, + storage, + writerProvider, + newWriterProvider, + partitioner, + connectorConfig, + context, + avroData + ); Schema schema = createSchema(); List records = new ArrayList<>(); @@ -153,12 +191,20 @@ public void testWriteRecordFieldPartitioner() throws Exception { @Test public void testWriteRecordTimeBasedPartition() throws Exception { - Map config = createConfig(); + setUp(); Partitioner partitioner = new TimeBasedPartitioner(); - partitioner.configure(config); + partitioner.configure(parsedConfig); TopicPartitionWriter topicPartitionWriter = new TopicPartitionWriter( - TOPIC_PARTITION, storage, writerProvider, partitioner, connectorConfig, context, avroData); + TOPIC_PARTITION, 
+ storage, + writerProvider, + newWriterProvider, + partitioner, + connectorConfig, + context, + avroData + ); Schema schema = createSchema(); List records = createRecordBatches(schema, 3, 3); @@ -174,9 +220,11 @@ public void testWriteRecordTimeBasedPartition() throws Exception { topicPartitionWriter.write(); topicPartitionWriter.close(); - long partitionDurationMs = (Long) config.get(HdfsSinkConnectorConfig.PARTITION_DURATION_MS_CONFIG); - String pathFormat = (String) config.get(HdfsSinkConnectorConfig.PATH_FORMAT_CONFIG); - String timeZoneString = (String) config.get(HdfsSinkConnectorConfig.TIMEZONE_CONFIG); + long partitionDurationMs = (Long) parsedConfig.get( + PartitionerConfig.PARTITION_DURATION_MS_CONFIG + ); + String pathFormat = (String) parsedConfig.get(PartitionerConfig.PATH_FORMAT_CONFIG); + String timeZoneString = (String) parsedConfig.get(PartitionerConfig.TIMEZONE_CONFIG); long timestamp = System.currentTimeMillis(); String encodedPartition = TimeUtils.encodeTimestamp(partitionDurationMs, pathFormat, timeZoneString, timestamp); @@ -192,16 +240,6 @@ public void testWriteRecordTimeBasedPartition() throws Exception { verify(expectedFiles, expectedBatchSize, records, schema); } - private Map createConfig() { - Map config = new HashMap<>(); - config.put(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG, "int"); - config.put(HdfsSinkConnectorConfig.PARTITION_DURATION_MS_CONFIG, TimeUnit.HOURS.toMillis(1)); - config.put(HdfsSinkConnectorConfig.PATH_FORMAT_CONFIG, "'year'=YYYY/'month'=MM/'day'=dd/'hour'=HH/"); - config.put(HdfsSinkConnectorConfig.LOCALE_CONFIG, "en"); - config.put(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, "America/Los_Angeles"); - return config; - } - private void createTopicDir(String url, String topicsDir, String topic) throws IOException { Path path = new Path(FileUtils.topicDirectory(url, topicsDir, topic)); if (!fs.exists(path)) { @@ -224,7 +262,7 @@ private void verify(Set expectedFiles, int expectedSize, List reco for (FileStatus status : statuses) { Path filePath = status.getPath(); assertTrue(expectedFiles.contains(status.getPath())); - Collection avroRecords = schemaFileReader.readData(conf, filePath); + Collection avroRecords = dataFileReader.readData(connectorConfig.getHadoopConfiguration(), filePath); assertEquals(expectedSize, avroRecords.size()); for (Object avroRecord : avroRecords) { assertEquals(avroData.fromConnectData(schema, records.get(index++)), avroRecord); diff --git a/src/test/java/io/confluent/connect/hdfs/hive/HiveExec.java b/src/test/java/io/confluent/connect/hdfs/hive/HiveExec.java index 7539bed44..0b7640608 100644 --- a/src/test/java/io/confluent/connect/hdfs/hive/HiveExec.java +++ b/src/test/java/io/confluent/connect/hdfs/hive/HiveExec.java @@ -30,6 +30,7 @@ import java.util.List; import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; +import io.confluent.connect.storage.hive.HiveConfig; public class HiveExec { @@ -44,7 +45,7 @@ public class HiveExec { */ public HiveExec(HdfsSinkConnectorConfig config) { hiveConf = new HiveConf(); - String hiveConfDir = config.getString(HdfsSinkConnectorConfig.HIVE_CONF_DIR_CONFIG); + String hiveConfDir = config.getString(HiveConfig.HIVE_CONF_DIR_CONFIG); hiveConf.addResource(new Path(hiveConfDir, "hive-site.xml")); SessionState.start(new CliSessionState(hiveConf)); cliDriver = new CliDriver(); @@ -70,7 +71,7 @@ public void executeQuery(String query) throws IOException { } - private String[] getHiveArgs(String... args) throws IOException { + private String[] getHiveArgs(String... 
args) { List newArgs = new LinkedList<>(); newArgs.addAll(Arrays.asList(args)); if (hiveConf.getBoolean(HIVE_SASL_ENABLED, false)) { diff --git a/src/test/java/io/confluent/connect/hdfs/hive/HiveSchemaConverterTest.java b/src/test/java/io/confluent/connect/hdfs/hive/HiveSchemaConverterTest.java deleted file mode 100644 index c4dfe7525..000000000 --- a/src/test/java/io/confluent/connect/hdfs/hive/HiveSchemaConverterTest.java +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Copyright 2015 Confluent Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License - * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express - * or implied. See the License for the specific language governing permissions and limitations under - * the License. - **/ - -package io.confluent.connect.hdfs.hive; - -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.SchemaBuilder; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.List; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class HiveSchemaConverterTest { - - private static final Schema SIMPLE_STRUCT = SchemaBuilder.struct().name("SimpleStruct") - .field("id", Schema.INT32_SCHEMA) - .field("name", Schema.STRING_SCHEMA) - .build(); - - private static final Schema COMPLEX_STRUCT = SchemaBuilder.struct().name("ComplexStruct") - .field("groupName", Schema.STRING_SCHEMA) - .field("simpleStructs", SchemaBuilder.array(SIMPLE_STRUCT).build()) - .build(); - - - @Test - public void testConvertSimpleStruct() { - TypeInfo type = HiveSchemaConverter.convert(SIMPLE_STRUCT); - assertTrue(type instanceof StructTypeInfo); - - List expectedFieldNames = new ArrayList<>(); - expectedFieldNames.add("id"); - expectedFieldNames.add("name"); - - assertEquals(expectedFieldNames, ((StructTypeInfo) type).getAllStructFieldNames()); - } - - @Test - public void testConvertComplexStruct() { - List fields = HiveSchemaConverter.convertSchema(COMPLEX_STRUCT); - List expectedFieldNames = new ArrayList<>(); - expectedFieldNames.add("groupName"); - expectedFieldNames.add("simpleStructs"); - - List actualFieldNames = new ArrayList<>(); - for (FieldSchema fieldSchema: fields) { - actualFieldNames.add(fieldSchema.getName()); - } - assertEquals(expectedFieldNames, actualFieldNames); - - List expectedTypes = new ArrayList<>(); - List typeInfos = new ArrayList<>(); - typeInfos.add(TypeInfoFactory.intTypeInfo); - typeInfos.add(TypeInfoFactory.stringTypeInfo); - - expectedTypes.add(TypeInfoFactory.stringTypeInfo.toString()); - - List expectedInnerFieldNames = new ArrayList<>(); - expectedInnerFieldNames.add("id"); - expectedInnerFieldNames.add("name"); - TypeInfo structType = TypeInfoFactory.getStructTypeInfo(expectedInnerFieldNames, typeInfos); - - expectedTypes.add(TypeInfoFactory.getListTypeInfo(structType).toString()); - - List actualTypes = new ArrayList<>(); - for (FieldSchema fieldSchema: fields) { - actualTypes.add(fieldSchema.getType()); - } - 
assertEquals(expectedTypes, actualTypes); - } - - @Test - public void testConvertArray() { - TypeInfo type = HiveSchemaConverter.convert(SchemaBuilder.array(Schema.FLOAT32_SCHEMA)); - assertEquals(TypeInfoFactory.getListTypeInfo(TypeInfoFactory.floatTypeInfo), type); - } - - @Test - public void testConvertMap() { - TypeInfo type = HiveSchemaConverter.convert(SchemaBuilder.map(Schema.STRING_SCHEMA, Schema.FLOAT64_SCHEMA)); - assertEquals(TypeInfoFactory.getMapTypeInfo(TypeInfoFactory.stringTypeInfo, TypeInfoFactory.doubleTypeInfo), type); - - } -} diff --git a/src/test/java/io/confluent/connect/hdfs/hive/HiveTestBase.java b/src/test/java/io/confluent/connect/hdfs/hive/HiveTestBase.java index 3b8704f32..fc2b63f21 100644 --- a/src/test/java/io/confluent/connect/hdfs/hive/HiveTestBase.java +++ b/src/test/java/io/confluent/connect/hdfs/hive/HiveTestBase.java @@ -15,12 +15,11 @@ package io.confluent.connect.hdfs.hive; import org.junit.After; -import org.junit.Before; import java.util.Map; -import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.TestWithMiniDFSCluster; +import io.confluent.connect.storage.hive.HiveConfig; public class HiveTestBase extends TestWithMiniDFSCluster { @@ -28,10 +27,17 @@ public class HiveTestBase extends TestWithMiniDFSCluster { protected HiveMetaStore hiveMetaStore; protected HiveExec hiveExec; - @Before + @Override + protected Map createProps() { + Map props = super.createProps(); + props.put(HiveConfig.HIVE_CONF_DIR_CONFIG, "hive_conf"); + return props; + } + + //@Before should be omitted in order to be able to add properties per test. public void setUp() throws Exception { super.setUp(); - hiveDatabase = connectorConfig.getString(HdfsSinkConnectorConfig.HIVE_DATABASE_CONFIG); + hiveDatabase = connectorConfig.getString(HiveConfig.HIVE_DATABASE_CONFIG); hiveMetaStore = new HiveMetaStore(conf, connectorConfig); hiveExec = new HiveExec(connectorConfig); cleanHive(); @@ -43,14 +49,7 @@ public void tearDown() throws Exception { super.tearDown(); } - @Override - protected Map createProps() { - Map props = super.createProps(); - props.put(HdfsSinkConnectorConfig.HIVE_CONF_DIR_CONFIG, "hive_conf"); - return props; - } - - private void cleanHive() throws Exception { + private void cleanHive() { // ensures all tables are removed for (String database : hiveMetaStore.getAllDatabases()) { for (String table : hiveMetaStore.getAllTables(database)) { diff --git a/src/test/java/io/confluent/connect/hdfs/hive/HiveTestUtils.java b/src/test/java/io/confluent/connect/hdfs/hive/HiveTestUtils.java index 09266d9fe..c7a4e89e7 100644 --- a/src/test/java/io/confluent/connect/hdfs/hive/HiveTestUtils.java +++ b/src/test/java/io/confluent/connect/hdfs/hive/HiveTestUtils.java @@ -21,16 +21,16 @@ import java.io.ByteArrayOutputStream; import java.io.InputStreamReader; import java.io.PrintStream; -import java.util.HashMap; +import java.util.Map; import io.confluent.connect.hdfs.partitioner.DefaultPartitioner; import io.confluent.connect.hdfs.partitioner.Partitioner; public class HiveTestUtils { - public static Partitioner getPartitioner() { + public static Partitioner getPartitioner(Map parsedConfig) { Partitioner partitioner = new DefaultPartitioner(); - partitioner.configure(new HashMap()); + partitioner.configure(parsedConfig); return partitioner; } diff --git a/src/test/java/io/confluent/connect/hdfs/parquet/DataWriterParquetTest.java b/src/test/java/io/confluent/connect/hdfs/parquet/DataWriterParquetTest.java index ac4ae9930..33bef5d44 100644 --- 
a/src/test/java/io/confluent/connect/hdfs/parquet/DataWriterParquetTest.java +++ b/src/test/java/io/confluent/connect/hdfs/parquet/DataWriterParquetTest.java @@ -25,14 +25,13 @@ import io.confluent.connect.hdfs.DataWriter; import io.confluent.connect.hdfs.HdfsSinkConnectorConfig; import io.confluent.connect.hdfs.TestWithMiniDFSCluster; -import io.confluent.connect.hdfs.partitioner.Partitioner; public class DataWriterParquetTest extends TestWithMiniDFSCluster { @Before public void setUp() throws Exception { super.setUp(); - schemaFileReader = new ParquetFileReader(avroData); + dataFileReader = new ParquetDataFileReader(); extension = ".parquet"; } @@ -52,7 +51,7 @@ public void testWriteRecord() throws Exception { List sinkRecords = createSinkRecords(7); hdfsWriter.write(sinkRecords); - hdfsWriter.close(assignment); + hdfsWriter.close(); hdfsWriter.stop(); // Last file (offset 6) doesn't satisfy size requirement and gets discarded on close diff --git a/src/test/java/io/confluent/connect/hdfs/parquet/HiveIntegrationParquetTest.java b/src/test/java/io/confluent/connect/hdfs/parquet/HiveIntegrationParquetTest.java index 561f298df..bf234c865 100644 --- a/src/test/java/io/confluent/connect/hdfs/parquet/HiveIntegrationParquetTest.java +++ b/src/test/java/io/confluent/connect/hdfs/parquet/HiveIntegrationParquetTest.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; @@ -38,35 +39,42 @@ import io.confluent.connect.hdfs.partitioner.DailyPartitioner; import io.confluent.connect.hdfs.partitioner.FieldPartitioner; import io.confluent.connect.hdfs.partitioner.TimeUtils; +import io.confluent.connect.storage.hive.HiveConfig; +import io.confluent.connect.storage.partitioner.PartitionerConfig; import static org.junit.Assert.assertEquals; public class HiveIntegrationParquetTest extends HiveTestBase { + private Map localProps = new HashMap<>(); + @Override protected Map createProps() { Map props = super.createProps(); props.put(HdfsSinkConnectorConfig.SHUTDOWN_TIMEOUT_CONFIG, "10000"); props.put(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG, ParquetFormat.class.getName()); + props.putAll(localProps); return props; } + //@Before should be omitted in order to be able to add properties per test. 
+  public void setUp() throws Exception {
+    super.setUp();
+  }
+
   @Test
   public void testSyncWithHiveParquet() throws Exception {
-    Map props = createProps();
-    HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props);
-
+    setUp();
     DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData);
     hdfsWriter.recover(TOPIC_PARTITION);
     List sinkRecords = createSinkRecords(7);
     hdfsWriter.write(sinkRecords);
-    hdfsWriter.close(assignment);
+    hdfsWriter.close();
     hdfsWriter.stop();
-    props = createProps();
-    props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true");
-    HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props);
+    localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true");
+    HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(createProps());
     hdfsWriter = new DataWriter(config, context, avroData);
     hdfsWriter.syncWithHive();
@@ -95,23 +103,22 @@ public void testSyncWithHiveParquet() throws Exception {
     assertEquals(expectedPartitions, partitions);
-    hdfsWriter.close(assignment);
+    hdfsWriter.close();
     hdfsWriter.stop();
   }
   @Test
   public void testHiveIntegrationParquet() throws Exception {
-    Map props = createProps();
-    props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true");
-    HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props);
+    localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true");
+    setUp();
-    DataWriter hdfsWriter = new DataWriter(config, context, avroData);
+    DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData);
     hdfsWriter.recover(TOPIC_PARTITION);
     List sinkRecords = createSinkRecords(7);
     hdfsWriter.write(sinkRecords);
-    hdfsWriter.close(assignment);
+    hdfsWriter.close();
     hdfsWriter.stop();
     Schema schema = createSchema();
@@ -138,20 +145,18 @@ public void testHiveIntegrationParquet() throws Exception {
   @Test
   public void testHiveIntegrationFieldPartitionerParquet() throws Exception {
-    Map props = createProps();
-    props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true");
-    props.put(HdfsSinkConnectorConfig.PARTITIONER_CLASS_CONFIG, FieldPartitioner.class.getName());
-    props.put(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG, "int");
-
-    HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props);
-    DataWriter hdfsWriter = new DataWriter(config, context, avroData);
+    localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true");
+    localProps.put(PartitionerConfig.PARTITIONER_CLASS_CONFIG, FieldPartitioner.class.getName());
+    localProps.put(PartitionerConfig.PARTITION_FIELD_NAME_CONFIG, "int");
+    setUp();
+    DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData);
     Schema schema = createSchema();
     List records = createRecordBatches(schema, 3, 3);
     List sinkRecords = createSinkRecords(records, schema);
     hdfsWriter.write(sinkRecords);
-    hdfsWriter.close(assignment);
+    hdfsWriter.close();
     hdfsWriter.stop();
     Table table = hiveMetaStore.getTable(hiveDatabase, TOPIC);
@@ -167,7 +172,9 @@ public void testHiveIntegrationFieldPartitionerParquet() throws Exception {
     }
     assertEquals(expectedColumnNames, actualColumnNames);
-    String partitionFieldName = config.getString(HdfsSinkConnectorConfig.PARTITION_FIELD_NAME_CONFIG);
+    String partitionFieldName = connectorConfig.getString(
+        PartitionerConfig.PARTITION_FIELD_NAME_CONFIG
+    );
     String directory1 = TOPIC + "/" + partitionFieldName + "=" + String.valueOf(16);
     String directory2 = TOPIC + "/" + partitionFieldName + "=" + String.valueOf(17);
     String directory3 = TOPIC + "/" + partitionFieldName + "=" + String.valueOf(18);
@@ -192,7 +199,10 @@ public void testHiveIntegrationFieldPartitionerParquet() throws Exception {
       }
     }
-    String result = HiveTestUtils.runHive(hiveExec, "SELECT * FROM " + TOPIC);
+    String result = HiveTestUtils.runHive(
+        hiveExec,
+        "SELECT * FROM " + hiveMetaStore.tableNameConverter(TOPIC)
+    );
     String[] rows = result.split("\n");
     assertEquals(9, rows.length);
     for (int i = 0; i < rows.length; ++i) {
@@ -206,21 +216,17 @@ public void testHiveIntegrationFieldPartitionerParquet() throws Exception {
   @Test
   public void testHiveIntegrationTimeBasedPartitionerParquet() throws Exception {
-    Map props = createProps();
-    props.put(HdfsSinkConnectorConfig.HIVE_INTEGRATION_CONFIG, "true");
-    props.put(HdfsSinkConnectorConfig.PARTITIONER_CLASS_CONFIG, DailyPartitioner.class.getName());
-    props.put(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, "America/Los_Angeles");
-    props.put(HdfsSinkConnectorConfig.LOCALE_CONFIG, "en");
-
-    HdfsSinkConnectorConfig config = new HdfsSinkConnectorConfig(props);
-    DataWriter hdfsWriter = new DataWriter(config, context, avroData);
+    localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true");
+    localProps.put(PartitionerConfig.PARTITIONER_CLASS_CONFIG, DailyPartitioner.class.getName());
+    setUp();
+    DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData);
     Schema schema = createSchema();
     List records = createRecordBatches(schema, 3, 3);
     List sinkRecords = createSinkRecords(records, schema);
     hdfsWriter.write(sinkRecords);
-    hdfsWriter.close(assignment);
+    hdfsWriter.close();
     hdfsWriter.stop();
     Table table = hiveMetaStore.getTable(hiveDatabase, TOPIC);
@@ -270,13 +276,17 @@ public void testHiveIntegrationTimeBasedPartitionerParquet() throws Exception {
       }
     }
-    String result = HiveTestUtils.runHive(hiveExec, "SELECT * FROM " + TOPIC);
+    String result = HiveTestUtils.runHive(
+        hiveExec,
+        "SELECT * FROM " + hiveMetaStore.tableNameConverter(TOPIC)
+    );
     String[] rows = result.split("\n");
     assertEquals(9, rows.length);
     for (int i = 0; i < rows.length; ++i) {
       String[] parts = HiveTestUtils.parseOutput(rows[i]);
       int j = 0;
       for (String expectedValue : expectedResults.get(i)) {
+        System.err.println("Exp: " + expectedValue + " Actual: " + parts[j]);
         assertEquals(expectedValue, parts[j++]);
       }
     }
diff --git a/src/test/java/io/confluent/connect/hdfs/parquet/ParquetDataFileReader.java b/src/test/java/io/confluent/connect/hdfs/parquet/ParquetDataFileReader.java
new file mode 100644
index 000000000..32f71e7d7
--- /dev/null
+++ b/src/test/java/io/confluent/connect/hdfs/parquet/ParquetDataFileReader.java
@@ -0,0 +1,29 @@
+package io.confluent.connect.hdfs.parquet;
+
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.avro.AvroReadSupport;
+import org.apache.parquet.hadoop.ParquetReader;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+
+import io.confluent.connect.hdfs.DataFileReader;
+
+public class ParquetDataFileReader implements DataFileReader {
+  @Override
+  public Collection readData(Configuration conf, Path path) throws IOException {
+    Collection result = new ArrayList<>();
+    AvroReadSupport readSupport = new AvroReadSupport<>();
+    ParquetReader.Builder builder = ParquetReader.builder(readSupport, path);
+    ParquetReader parquetReader = builder.withConf(conf).build();
+    GenericRecord record;
+    while ((record = parquetReader.read()) != null) {
+      result.add(record);
+    }
+    parquetReader.close();
+    return result;
+  }
+}
diff --git a/src/test/java/io/confluent/connect/hdfs/parquet/ParquetHiveUtilTest.java b/src/test/java/io/confluent/connect/hdfs/parquet/ParquetHiveUtilTest.java
index 2467cf916..c5cf288c4 100644
--- a/src/test/java/io/confluent/connect/hdfs/parquet/ParquetHiveUtilTest.java
+++ b/src/test/java/io/confluent/connect/hdfs/parquet/ParquetHiveUtilTest.java
@@ -27,6 +27,7 @@ import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -41,28 +42,28 @@ import static org.junit.Assert.assertEquals;
 public class ParquetHiveUtilTest extends HiveTestBase {
-  private HiveUtil hive;
+  private Map localProps = new HashMap<>();
   @Override
   protected Map createProps() {
     Map props = super.createProps();
     props.put(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG, ParquetFormat.class.getName());
+    props.putAll(localProps);
     return props;
   }
-  @Before
+  //@Before should be omitted in order to be able to add properties per test.
   public void setUp() throws Exception {
     super.setUp();
-    Map props = createProps();
-    HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props);
-    hive = new ParquetHiveUtil(connectorConfig, avroData, hiveMetaStore);
+    hive = new ParquetHiveUtil(connectorConfig, hiveMetaStore);
   }
   @Test
   public void testCreateTable() throws Exception {
+    setUp();
     prepareData(TOPIC, PARTITION);
-    Partitioner partitioner = HiveTestUtils.getPartitioner();
+    Partitioner partitioner = HiveTestUtils.getPartitioner(parsedConfig);
     Schema schema = createSchema();
     hive.createTable(hiveDatabase, TOPIC, schema, partitioner);
@@ -88,7 +89,10 @@ public void testCreateTable() throws Exception {
     assertEquals(1, partitionCols.size());
     assertEquals("partition", partitionCols.get(0).getName());
-    String result = HiveTestUtils.runHive(hiveExec, "SELECT * from " + TOPIC);
+    String result = HiveTestUtils.runHive(
+        hiveExec,
+        "SELECT * from " + hiveMetaStore.tableNameConverter(TOPIC)
+    );
     String[] rows = result.split("\n");
     // Only 6 of the 7 records should have been delivered due to flush_size = 3
     assertEquals(6, rows.length);
@@ -103,8 +107,9 @@ public void testCreateTable() throws Exception {
   @Test
   public void testAlterSchema() throws Exception {
+    setUp();
     prepareData(TOPIC, PARTITION);
-    Partitioner partitioner = HiveTestUtils.getPartitioner();
+    Partitioner partitioner = HiveTestUtils.getPartitioner(parsedConfig);
     Schema schema = createSchema();
     hive.createTable(hiveDatabase, TOPIC, schema, partitioner);
@@ -130,7 +135,10 @@ public void testAlterSchema() throws Exception {
     hive.alterSchema(hiveDatabase, TOPIC, newSchema);
-    String result = HiveTestUtils.runHive(hiveExec, "SELECT * from " + TOPIC);
+    String result = HiveTestUtils.runHive(
+        hiveExec,
+        "SELECT * from " + hiveMetaStore.tableNameConverter(TOPIC)
+    );
     String[] rows = result.split("\n");
     // Only 6 of the 7 records should have been delivered due to flush_size = 3
     assertEquals(6, rows.length);
@@ -143,7 +151,7 @@ public void testAlterSchema() throws Exception {
     }
   }
-  private void prepareData(String topic, int partition) throws Exception {
+  private void prepareData(String topic, int partition) {
     TopicPartition tp = new TopicPartition(topic, partition);
     DataWriter hdfsWriter = createWriter(context, avroData);
     hdfsWriter.recover(tp);
@@ -158,13 +166,11 @@ private void prepareData(String topic, int partition) {
       sinkRecords.add(sinkRecord);
     }
     hdfsWriter.write(sinkRecords);
-    hdfsWriter.close(assignment);
+    hdfsWriter.close();
     hdfsWriter.stop();
   }
   private DataWriter createWriter(SinkTaskContext context, AvroData avroData) {
-    Map props = createProps();
-    HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props);
     return new DataWriter(connectorConfig, context, avroData);
   }
 }
diff --git a/src/test/java/io/confluent/connect/hdfs/partitioner/DailyPartitionerTest.java b/src/test/java/io/confluent/connect/hdfs/partitioner/DailyPartitionerTest.java
index 416de6d10..c80e31d88 100644
--- a/src/test/java/io/confluent/connect/hdfs/partitioner/DailyPartitionerTest.java
+++ b/src/test/java/io/confluent/connect/hdfs/partitioner/DailyPartitionerTest.java
@@ -23,32 +23,27 @@ import java.util.concurrent.TimeUnit;
 import io.confluent.connect.hdfs.HdfsSinkConnectorConfig;
+import io.confluent.connect.hdfs.TestWithMiniDFSCluster;
+import io.confluent.connect.storage.hive.schema.TimeBasedSchemaGenerator;
+import io.confluent.connect.storage.partitioner.PartitionerConfig;
 import static org.junit.Assert.assertEquals;
-public class DailyPartitionerTest {
-
+public class DailyPartitionerTest extends TestWithMiniDFSCluster {
   private static final long partitionDurationMs = TimeUnit.HOURS.toMillis(24);
   @Test
   public void testDailyPartitioner() throws Exception {
-    Map config = createConfig();
-
+    setUp();
     DailyPartitioner partitioner = new DailyPartitioner();
-    partitioner.configure(config);
+    partitioner.configure(parsedConfig);
     String pathFormat = partitioner.getPathFormat();
-    String timeZoneString = (String) config.get(HdfsSinkConnectorConfig.TIMEZONE_CONFIG);
+    String timeZoneString = (String) parsedConfig.get(PartitionerConfig.TIMEZONE_CONFIG);
     long timestamp = new DateTime(2014, 2, 1, 3, 0, 0, 0, DateTimeZone.forID(timeZoneString)).getMillis();
     String encodedPartition = TimeUtils.encodeTimestamp(partitionDurationMs, pathFormat, timeZoneString, timestamp);
     String path = partitioner.generatePartitionedPath("topic", encodedPartition);
-    assertEquals("topic/year=2014/month=02/day=01/", path);
+    assertEquals("topic/year=2014/month=02/day=01", path);
   }
-  private Map createConfig() {
-    Map config = new HashMap<>();
-    config.put(HdfsSinkConnectorConfig.LOCALE_CONFIG, "en");
-    config.put(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, "America/Los_Angeles");
-    return config;
-  }
 }
diff --git a/src/test/java/io/confluent/connect/hdfs/partitioner/HourlyPartitionerTest.java b/src/test/java/io/confluent/connect/hdfs/partitioner/HourlyPartitionerTest.java
index bfadc1707..e541b17f0 100644
--- a/src/test/java/io/confluent/connect/hdfs/partitioner/HourlyPartitionerTest.java
+++ b/src/test/java/io/confluent/connect/hdfs/partitioner/HourlyPartitionerTest.java
@@ -23,33 +23,28 @@ import java.util.concurrent.TimeUnit;
 import io.confluent.connect.hdfs.HdfsSinkConnectorConfig;
+import io.confluent.connect.hdfs.TestWithMiniDFSCluster;
+import io.confluent.connect.storage.hive.schema.TimeBasedSchemaGenerator;
+import io.confluent.connect.storage.partitioner.PartitionerConfig;
 import static org.junit.Assert.assertEquals;
-public class HourlyPartitionerTest {
-
+public class HourlyPartitionerTest extends TestWithMiniDFSCluster {
   private static final long partitionDurationMs = TimeUnit.HOURS.toMillis(1);
   @Test
   public void testHourlyPartitioner() throws Exception {
-    Map config = createConfig();
-
+    setUp();
     HourlyPartitioner partitioner = new HourlyPartitioner();
-    partitioner.configure(config);
+    partitioner.configure(parsedConfig);
     String pathFormat = partitioner.getPathFormat();
-    String timeZoneString = (String) config.get(HdfsSinkConnectorConfig.TIMEZONE_CONFIG);
+    String timeZoneString = (String) parsedConfig.get(PartitionerConfig.TIMEZONE_CONFIG);
     long timestamp = new DateTime(2015, 2, 1, 3, 0, 0, 0, DateTimeZone.forID(timeZoneString)).getMillis();
     String encodedPartition = TimeUtils.encodeTimestamp(partitionDurationMs, pathFormat, timeZoneString, timestamp);
     String path = partitioner.generatePartitionedPath("topic", encodedPartition);
-    assertEquals("topic/year=2015/month=02/day=01/hour=03/", path);
+    assertEquals("topic/year=2015/month=02/day=01/hour=03", path);
   }
-  private Map createConfig() {
-    Map config = new HashMap<>();
-    config.put(HdfsSinkConnectorConfig.LOCALE_CONFIG, "en");
-    config.put(HdfsSinkConnectorConfig.TIMEZONE_CONFIG, "America/Los_Angeles");
-    return config;
-  }
 }
diff --git a/src/test/java/io/confluent/connect/hdfs/partitioner/TimeBasedPartitionerTest.java b/src/test/java/io/confluent/connect/hdfs/partitioner/TimeBasedPartitionerTest.java
index 4e4f3fbdb..3f765a131 100644
--- a/src/test/java/io/confluent/connect/hdfs/partitioner/TimeBasedPartitionerTest.java
+++ b/src/test/java/io/confluent/connect/hdfs/partitioner/TimeBasedPartitionerTest.java
@@ -24,16 +24,21 @@ import java.util.Map;
 import java.util.concurrent.TimeUnit;
+import io.confluent.connect.hdfs.HdfsSinkConnectorTestBase;
+import io.confluent.connect.storage.hive.schema.TimeBasedSchemaGenerator;
+import io.confluent.connect.storage.partitioner.PartitionerConfig;
+
 import static org.junit.Assert.assertEquals;
-public class TimeBasedPartitionerTest {
+public class TimeBasedPartitionerTest extends HdfsSinkConnectorTestBase {
   private static final String timeZoneString = "America/Los_Angeles";
   private static final DateTimeZone DATE_TIME_ZONE = DateTimeZone.forID(timeZoneString);
-  private BiHourlyPartitioner partitioner = new BiHourlyPartitioner();
   @Test
   public void testGeneratePartitionedPath() throws Exception {
-    partitioner.configure(null);
+    setUp();
+    BiHourlyPartitioner partitioner = new BiHourlyPartitioner();
+    partitioner.configure(parsedConfig);
     String pathFormat = partitioner.getPathFormat();
     long partitionDurationMs = TimeUnit.HOURS.toMillis(2);
     long timestamp = new DateTime(2015, 1, 1, 3, 0, 0, 0, DateTimeZone.forID(timeZoneString)).getMillis();
@@ -60,7 +65,7 @@ private static class BiHourlyPartitioner extends TimeBasedPartitioner {
     @Override
     public void configure(Map config) {
-      init(partitionDurationMs, pathFormat, Locale.FRENCH, DATE_TIME_ZONE, true);
+      init(partitionDurationMs, pathFormat, Locale.FRENCH, DATE_TIME_ZONE, config);
     }
     public String getPathFormat() {
diff --git a/src/test/java/io/confluent/connect/hdfs/utils/MemoryFormat.java b/src/test/java/io/confluent/connect/hdfs/utils/MemoryFormat.java
index bf077ab99..a283f46e5 100644
--- a/src/test/java/io/confluent/connect/hdfs/utils/MemoryFormat.java
+++ b/src/test/java/io/confluent/connect/hdfs/utils/MemoryFormat.java
@@ -1,24 +1,40 @@ package io.confluent.connect.hdfs.utils;
-import io.confluent.connect.avro.AvroData;
-import io.confluent.connect.hdfs.Format;
+import org.apache.hadoop.fs.Path;
+import org.apache.kafka.common.config.AbstractConfig;
+
 import io.confluent.connect.hdfs.HdfsSinkConnectorConfig;
-import io.confluent.connect.hdfs.RecordWriterProvider;
-import io.confluent.connect.hdfs.SchemaFileReader;
-import io.confluent.connect.hdfs.hive.HiveMetaStore;
-import io.confluent.connect.hdfs.hive.HiveUtil;
+import io.confluent.connect.hdfs.storage.HdfsStorage;
+import io.confluent.connect.storage.hive.HiveFactory;
+
+public class MemoryFormat
+    implements io.confluent.connect.storage.format.Format {
-public class MemoryFormat implements Format {
+  // DO NOT change this signature, it is required for instantiation via reflection
+  public MemoryFormat(HdfsStorage storage) {
+  }
-  public RecordWriterProvider getRecordWriterProvider() {
+  @Override
+  public io.confluent.connect.storage.format.RecordWriterProvider getRecordWriterProvider() {
     return new MemoryRecordWriterProvider();
   }
-  public SchemaFileReader getSchemaFileReader(AvroData avroData) {
+  @Override
+  public io.confluent.connect.storage.format.SchemaFileReader
+      getSchemaFileReader() {
     return null;
   }
-  public HiveUtil getHiveUtil(HdfsSinkConnectorConfig config, AvroData avroData, HiveMetaStore hiveMetaStore) {
-    return null;
+  @Override
+  public HiveFactory getHiveFactory() {
+    return new HiveFactory() {
+      @Override
+      public io.confluent.connect.storage.hive.HiveUtil createHiveUtil(
+          AbstractConfig abstractConfig,
+          io.confluent.connect.storage.hive.HiveMetaStore hiveMetaStore
+      ) {
+        return null;
+      }
+    };
   }
 }
diff --git a/src/test/java/io/confluent/connect/hdfs/utils/MemoryRecordWriter.java b/src/test/java/io/confluent/connect/hdfs/utils/MemoryRecordWriter.java
index 7ebbc6cec..13fe5ef3b 100644
--- a/src/test/java/io/confluent/connect/hdfs/utils/MemoryRecordWriter.java
+++ b/src/test/java/io/confluent/connect/hdfs/utils/MemoryRecordWriter.java
@@ -14,15 +14,15 @@ package io.confluent.connect.hdfs.utils;
+import org.apache.kafka.connect.errors.ConnectException;
 import org.apache.kafka.connect.sink.SinkRecord;
-import java.io.IOException;
 import java.util.List;
 import java.util.Map;
 import io.confluent.connect.hdfs.RecordWriter;
-public class MemoryRecordWriter implements RecordWriter {
+public class MemoryRecordWriter implements io.confluent.connect.storage.format.RecordWriter {
   private String filename;
   private static final Map> data = Data.getData();
   private Failure failure = Failure.noFailure;
@@ -38,20 +38,23 @@ public MemoryRecordWriter(String filename) {
   }
   @Override
-  public void write(SinkRecord record) throws IOException {
+  public void write(SinkRecord record) {
     if (failure == Failure.writeFailure) {
       failure = Failure.noFailure;
-      throw new IOException("write failed.");
+      throw new ConnectException("write failed.");
     }
     data.get(filename).add(record);
   }
   @Override
-  public void close() throws IOException {
+  public void commit() {}
+
+  @Override
+  public void close() {
     if (failure == Failure.closeFailure) {
       failure = Failure.noFailure;
-      throw new IOException("close failed.");
+      throw new ConnectException("close failed.");
     }
   }
diff --git a/src/test/java/io/confluent/connect/hdfs/utils/MemoryRecordWriterProvider.java b/src/test/java/io/confluent/connect/hdfs/utils/MemoryRecordWriterProvider.java
index d173eb526..a4a5f0b9f 100644
--- a/src/test/java/io/confluent/connect/hdfs/utils/MemoryRecordWriterProvider.java
+++ b/src/test/java/io/confluent/connect/hdfs/utils/MemoryRecordWriterProvider.java
@@ -16,19 +16,14 @@ package io.confluent.connect.hdfs.utils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.kafka.connect.sink.SinkRecord;
-
-import java.io.IOException;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-import io.confluent.connect.avro.AvroData;
-import io.confluent.connect.hdfs.RecordWriter;
-import io.confluent.connect.hdfs.RecordWriterProvider;
+import io.confluent.connect.hdfs.HdfsSinkConnectorConfig;
-public class MemoryRecordWriterProvider implements RecordWriterProvider {
+public class MemoryRecordWriterProvider
+    implements io.confluent.connect.storage.format.RecordWriterProvider {
   @Override
   public String getExtension() {
@@ -36,18 +31,17 @@ public String getExtension() {
   }
   @Override
-  public RecordWriter getRecordWriter(
-      Configuration conf, final String fileName, SinkRecord record, final AvroData avroData)
-      throws IOException {
-
+  public io.confluent.connect.storage.format.RecordWriter getRecordWriter(
+      HdfsSinkConnectorConfig conf,
+      final String filename
+  ) {
     final Map> data = Data.getData();
-    if (!data.containsKey(fileName)) {
-      data.put(fileName, new LinkedList<>());
+    if (!data.containsKey(filename)) {
+      data.put(filename, new LinkedList<>());
     }
-    return new MemoryRecordWriter(fileName);
+    return new MemoryRecordWriter(filename);
   }
-
 }
diff --git a/src/test/java/io/confluent/connect/hdfs/utils/MemoryStorage.java b/src/test/java/io/confluent/connect/hdfs/utils/MemoryStorage.java
index 4b892d0da..7c459c313 100644
--- a/src/test/java/io/confluent/connect/hdfs/utils/MemoryStorage.java
+++ b/src/test/java/io/confluent/connect/hdfs/utils/MemoryStorage.java
@@ -16,25 +16,28 @@ package io.confluent.connect.hdfs.utils;
+import org.apache.avro.file.SeekableInput;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.kafka.common.TopicPartition;
+import org.apache.kafka.connect.errors.ConnectException;
-import java.io.IOException;
+import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-import io.confluent.connect.hdfs.storage.Storage;
+import io.confluent.connect.hdfs.HdfsSinkConnectorConfig;
+import io.confluent.connect.hdfs.storage.HdfsStorage;
 import io.confluent.connect.hdfs.wal.WAL;
-public class MemoryStorage implements Storage {
+public class MemoryStorage extends HdfsStorage {
   private static final Map> data = Data.getData();
-  private Configuration conf;
+  private HdfsSinkConnectorConfig conf;
   private String url;
   private Failure failure = Failure.noFailure;
@@ -49,13 +52,14 @@ public enum Failure {
     closeFailure
   }
-  public MemoryStorage(Configuration conf, String url) {
+  public MemoryStorage(HdfsSinkConnectorConfig conf, String url) {
+    super(conf, url, null);
     this.conf = conf;
     this.url = url;
   }
   @Override
-  public FileStatus[] listStatus(String path) throws IOException {
+  public List list(String path) {
     List result = new ArrayList<>();
     for (String key: data.keySet()) {
       if (key.startsWith(path)) {
@@ -63,14 +67,13 @@ public List list(String path) {
         result.add(status);
       }
     }
-    return result.toArray(new FileStatus[result.size()]);
+    return result;
   }
-  @Override
-  public FileStatus[] listStatus(String path, PathFilter filter) throws IOException {
+  public List list(String path, PathFilter filter) {
     if (failure == Failure.listStatusFailure) {
       failure = Failure.noFailure;
-      throw new IOException("listStatus failed.");
+      throw new ConnectException("listStatus failed.");
     }
     List result = new ArrayList<>();
     for (String key: data.keySet()) {
@@ -79,14 +82,13 @@ public FileStatus[] listStatus(String path, PathFilter filter) throws IOException {
         result.add(status);
       }
     }
-    return result.toArray(new FileStatus[result.size()]);
+    return result;
   }
-  @Override
-  public void append(String filename, Object object) throws IOException {
+  public void append(String filename, Object object) {
     if (failure == Failure.appendFailure) {
       failure = Failure.noFailure;
failed."); + throw new ConnectException("append failed."); } if (!data.containsKey(filename)) { data.put(filename, new LinkedList<>()); @@ -95,28 +97,28 @@ public void append(String filename, Object object) throws IOException { } @Override - public boolean mkdirs(String filename) throws IOException { + public boolean create(String filename) { if (failure == Failure.mkdirsFailure) { failure = Failure.noFailure; - throw new IOException("mkdirs failed."); + throw new ConnectException("mkdirs failed."); } return true; } @Override - public boolean exists(String filename) throws IOException { + public boolean exists(String filename) { if (failure == Failure.existsFailure) { failure = Failure.noFailure; - throw new IOException("exists failed."); + throw new ConnectException("exists failed."); } return data.containsKey(filename); } @Override - public void delete(String filename) throws IOException { + public void delete(String filename) { if (failure == Failure.deleteFailure) { failure = Failure.noFailure; - throw new IOException("delete failed."); + throw new ConnectException("delete failed."); } if (data.containsKey(filename)) { data.get(filename).clear(); @@ -125,10 +127,10 @@ public void delete(String filename) throws IOException { } @Override - public void commit(String tempFile, String committedFile) throws IOException { + public void commit(String tempFile, String committedFile) { if (failure == Failure.commitFailure) { failure = Failure.noFailure; - throw new IOException("commit failed."); + throw new ConnectException("commit failed."); } if (!data.containsKey(committedFile)) { List entryList = data.get(tempFile); @@ -138,10 +140,10 @@ public void commit(String tempFile, String committedFile) throws IOException { } @Override - public void close() throws IOException { + public void close() { if (failure == Failure.closeFailure) { failure = Failure.noFailure; - throw new IOException("close failed."); + throw new ConnectException("close failed."); } data.clear(); } @@ -152,7 +154,7 @@ public WAL wal(String topicsDir, TopicPartition topicPart) { } @Override - public Configuration conf() { + public HdfsSinkConnectorConfig conf() { return conf; } @@ -164,4 +166,14 @@ public String url() { public void setFailure(Failure failure) { this.failure = failure; } + + @Override + public SeekableInput open(String filename, HdfsSinkConnectorConfig conf) { + return null; + } + + @Override + public OutputStream create(String filename, HdfsSinkConnectorConfig conf, boolean overwrite) { + return null; + } } diff --git a/src/test/java/io/confluent/connect/hdfs/utils/MemoryWAL.java b/src/test/java/io/confluent/connect/hdfs/utils/MemoryWAL.java index d522208cf..e16add5dc 100644 --- a/src/test/java/io/confluent/connect/hdfs/utils/MemoryWAL.java +++ b/src/test/java/io/confluent/connect/hdfs/utils/MemoryWAL.java @@ -19,7 +19,6 @@ import org.apache.kafka.common.TopicPartition; import org.apache.kafka.connect.errors.ConnectException; -import java.io.IOException; import java.util.List; import java.util.Map; @@ -30,10 +29,10 @@ public class MemoryWAL implements WAL { private String logFile; - private Storage storage; + private MemoryStorage storage; private static Map> data = Data.getData(); - public MemoryWAL(String topicsDir, TopicPartition topicPart, Storage storage) + public MemoryWAL(String topicsDir, TopicPartition topicPart, MemoryStorage storage) throws ConnectException { this.storage = storage; String url = storage.url(); @@ -48,46 +47,30 @@ public void acquireLease() throws ConnectException { @Override public 
   public void append(String tempFile, String committedFile) throws ConnectException {
-    try {
-      LogEntry entry = new LogEntry(tempFile, committedFile);
-      storage.append(logFile, entry);
-    } catch (IOException e) {
-      throw new ConnectException(e);
-    }
+    LogEntry entry = new LogEntry(tempFile, committedFile);
+    storage.append(logFile, entry);
   }
   @Override
   public void apply() throws ConnectException {
-    try {
-      if (data.containsKey(logFile)) {
-        List entryList = data.get(logFile);
-        for (Object entry : entryList) {
-          LogEntry logEntry = (LogEntry) entry;
-          storage.commit(logEntry.key(), logEntry.value());
-        }
+    if (data.containsKey(logFile)) {
+      List entryList = data.get(logFile);
+      for (Object entry : entryList) {
+        LogEntry logEntry = (LogEntry) entry;
+        storage.commit(logEntry.key(), logEntry.value());
       }
-    } catch (IOException e) {
-      throw new ConnectException(e);
     }
   }
   @Override
   public void truncate() throws ConnectException {
-    try {
-      storage.commit(logFile, logFile + ".1");
-      storage.delete(logFile);
-    } catch (IOException e) {
-      throw new ConnectException(e);
-    }
+    storage.commit(logFile, logFile + ".1");
+    storage.delete(logFile);
  }
   @Override
   public void close() throws ConnectException {
-    try {
-      storage.close();
-    } catch (IOException e) {
-      throw new ConnectException(e);
-    }
+    storage.close();
   }
   @Override
diff --git a/src/test/java/io/confluent/connect/hdfs/wal/FSWALTest.java b/src/test/java/io/confluent/connect/hdfs/wal/FSWALTest.java
index edcca1737..6eaebf6a8 100644
--- a/src/test/java/io/confluent/connect/hdfs/wal/FSWALTest.java
+++ b/src/test/java/io/confluent/connect/hdfs/wal/FSWALTest.java
@@ -29,7 +29,8 @@ public class FSWALTest extends TestWithMiniDFSCluster {
   @Test
   public void testTruncate() throws Exception {
-    Storage storage = new HdfsStorage(conf, url);
+    setUp();
+    HdfsStorage storage = new HdfsStorage(connectorConfig, url);
     TopicPartition tp = new TopicPartition("mytopic", 123);
     FSWAL wal = new FSWAL("/logs", tp, storage);
     wal.append("a", "b");
diff --git a/src/test/java/io/confluent/connect/hdfs/wal/WALFileTest.java b/src/test/java/io/confluent/connect/hdfs/wal/WALFileTest.java
index 4bb6bc629..b29f2e3c6 100644
--- a/src/test/java/io/confluent/connect/hdfs/wal/WALFileTest.java
+++ b/src/test/java/io/confluent/connect/hdfs/wal/WALFileTest.java
@@ -26,6 +26,7 @@ import io.confluent.connect.hdfs.FileUtils;
 import io.confluent.connect.hdfs.HdfsSinkConnectorConfig;
 import io.confluent.connect.hdfs.TestWithMiniDFSCluster;
+import io.confluent.connect.storage.common.StorageCommonConfig;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
@@ -33,18 +34,18 @@ public class WALFileTest extends TestWithMiniDFSCluster {
   @Test
-  public void testeAppend() throws Exception {
-    Map props = createProps();
-    HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props);
+  public void testAppend() throws Exception {
+    setUp();
+    HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(properties);
-    String topicsDir = connectorConfig.getString(HdfsSinkConnectorConfig.TOPICS_DIR_CONFIG);
+    String topicsDir = connectorConfig.getString(StorageCommonConfig.TOPICS_DIR_CONFIG);
     String topic = "topic";
     int partition = 0;
     TopicPartition topicPart = new TopicPartition(topic, partition);
     Path file = new Path(FileUtils.logFileName(url, topicsDir, topicPart));
-    WALFile.Writer writer = WALFile.createWriter(conf, WALFile.Writer.file(file));
+    WALFile.Writer writer = WALFile.createWriter(connectorConfig, WALFile.Writer.file(file));
     WALEntry key1 = new WALEntry("key1");
     WALEntry val1 = new WALEntry("val1");
@@ -58,7 +59,11 @@ public void testeAppend() throws Exception {
     verify2Values(file);
-    writer = WALFile.createWriter(conf, WALFile.Writer.file(file), WALFile.Writer.appendIfExists(true));
+    writer = WALFile.createWriter(
+        connectorConfig,
+        WALFile.Writer.file(file),
+        WALFile.Writer.appendIfExists(true)
+    );
     WALEntry key3 = new WALEntry("key3");
     WALEntry val3 = new WALEntry("val3");
diff --git a/src/test/java/io/confluent/connect/hdfs/wal/WALTest.java b/src/test/java/io/confluent/connect/hdfs/wal/WALTest.java
index 114843a88..323b73e8d 100644
--- a/src/test/java/io/confluent/connect/hdfs/wal/WALTest.java
+++ b/src/test/java/io/confluent/connect/hdfs/wal/WALTest.java
@@ -18,44 +18,57 @@ import org.apache.kafka.connect.errors.ConnectException;
 import org.junit.Test;
-import io.confluent.connect.hdfs.storage.StorageFactory;
 import io.confluent.connect.hdfs.FileUtils;
 import io.confluent.connect.hdfs.HdfsSinkConnectorConfig;
 import io.confluent.connect.hdfs.TestWithMiniDFSCluster;
-import io.confluent.connect.hdfs.storage.Storage;
+import io.confluent.connect.hdfs.storage.HdfsStorage;
+import io.confluent.connect.storage.common.StorageCommonConfig;
+import io.confluent.connect.storage.wal.WAL;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 public class WALTest extends TestWithMiniDFSCluster {
   private static final String ZERO_PAD_FMT = "%010d";
+  private HdfsStorage storage;
   private boolean closed;
   private static final String extension = ".avro";
   @Test
   public void testWALMultiClient() throws Exception {
+    setUp();
     fs.delete(new Path(FileUtils.directoryName(url, topicsDir, TOPIC_PARTITION)), true);
     @SuppressWarnings("unchecked")
-    Class storageClass = (Class)
-        Class.forName(connectorConfig.getString(HdfsSinkConnectorConfig.STORAGE_CLASS_CONFIG));
-    Storage storage = StorageFactory.createStorage(storageClass, conf, url);
-
+    Class storageClass = (Class) connectorConfig
+        .getClass(StorageCommonConfig.STORAGE_CLASS_CONFIG);
+    storage = io.confluent.connect.storage.StorageFactory.createStorage(
+        storageClass,
+        HdfsSinkConnectorConfig.class,
+        connectorConfig,
+        url
+    );
     final WAL wal1 = storage.wal(topicsDir, TOPIC_PARTITION);
     final WAL wal2 = storage.wal(topicsDir, TOPIC_PARTITION);
     String directory = TOPIC + "/" + String.valueOf(PARTITION);
     final String tempfile = FileUtils.tempFileName(url, topicsDir, directory, extension);
-    final String commitedFile = FileUtils.committedFileName(url, topicsDir, directory,
-        TOPIC_PARTITION, 0, 10, extension,
-        ZERO_PAD_FMT);
-
+    final String committedFile = FileUtils.committedFileName(
+        url,
+        topicsDir,
+        directory,
+        TOPIC_PARTITION,
+        0,
+        10,
+        extension,
+        ZERO_PAD_FMT
+    );
     fs.createNewFile(new Path(tempfile));
     wal1.acquireLease();
     wal1.append(WAL.beginMarker, "");
-    wal1.append(tempfile, commitedFile);
+    wal1.append(tempfile, committedFile);
     wal1.append(WAL.endMarker, "");
     Thread thread = new Thread(new Runnable() {
@@ -78,7 +91,7 @@ public void run() {
     wal2.apply();
     wal2.close();
-    assertTrue(fs.exists(new Path(commitedFile)));
+    assertTrue(fs.exists(new Path(committedFile)));
     assertFalse(fs.exists(new Path(tempfile)));
     storage.close();
   }
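Reviewer note (not part of the patch): the test changes above repeatedly apply the same per-test configuration pattern -- a localProps map folded into createProps(), a setUp() deliberately left without @Before so each test can add properties first, and the new no-argument DataWriter.close(). The following is a minimal illustrative sketch of that pattern only; the class name is hypothetical, and the inherited members (connectorConfig, context, avroData, TOPIC_PARTITION, createSinkRecords) are assumed to come from the test bases shown in the diff.

    import java.util.HashMap;
    import java.util.Map;

    import org.junit.Test;

    import io.confluent.connect.hdfs.DataWriter;
    import io.confluent.connect.storage.hive.HiveConfig;

    // Hypothetical example; mirrors the structure of HiveIntegrationParquetTest above.
    public class ExamplePerTestConfigTest extends HiveTestBase {
      private Map<String, String> localProps = new HashMap<>();

      @Override
      protected Map<String, String> createProps() {
        Map<String, String> props = super.createProps();
        props.putAll(localProps);  // fold per-test overrides into the connector config
        return props;
      }

      // @Before is intentionally omitted so each test can populate localProps first.
      public void setUp() throws Exception {
        super.setUp();  // builds connectorConfig from createProps()
      }

      @Test
      public void testWithHiveIntegrationEnabled() throws Exception {
        localProps.put(HiveConfig.HIVE_INTEGRATION_CONFIG, "true");  // per-test property
        setUp();  // connectorConfig now reflects the override
        DataWriter hdfsWriter = new DataWriter(connectorConfig, context, avroData);
        hdfsWriter.recover(TOPIC_PARTITION);
        hdfsWriter.write(createSinkRecords(7));
        hdfsWriter.close();  // close() no longer takes the partition assignment
        hdfsWriter.stop();
      }
    }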