-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Core implementation of backup into S3
- Loading branch information
1 parent
031f4bc
commit f1fd7dd
Showing
11 changed files
with
287 additions
and
17 deletions.
There are no files selected for viewing
Empty file.
27 changes: 27 additions & 0 deletions
27
backup-s3/src/main/scala/aiven/io/guardian/kafka/backup/s3/BackupClient.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package aiven.io.guardian.kafka.backup.s3 | ||
|
||
import aiven.io.guardian.kafka.KafkaClientInterface | ||
import aiven.io.guardian.kafka.backup.BackupClientInterface | ||
import aiven.io.guardian.kafka.backup.configs.Backup | ||
import aiven.io.guardian.kafka.s3.configs.{S3 => S3Config} | ||
import akka.stream.alpakka.s3.scaladsl.S3 | ||
import akka.stream.alpakka.s3.{MultipartUploadResult, S3Headers} | ||
import akka.stream.scaladsl._ | ||
import akka.util.ByteString | ||
|
||
import scala.concurrent.Future | ||
|
||
/** An S3-backed implementation of [[BackupClientInterface]]: persists each backup
  * object into the configured data bucket using Alpakka's S3 multipart upload.
  *
  * @param s3Headers Additional headers (e.g. server-side encryption, ACLs) attached
  *                  to every multipart upload request.
  */
class BackupClient(s3Headers: S3Headers)(implicit
    override val kafkaClientInterface: KafkaClientInterface,
    override val backupConfig: Backup,
    s3Config: S3Config
) extends BackupClientInterface {
  override type BackupResult = MultipartUploadResult

  /** Streams the backup payload for `key` straight into the data bucket.
    * @param key The object key under which the backup is stored
    * @return A Sink whose materialized value completes with the multipart upload result
    */
  override def backupToStorageSink(key: String): Sink[ByteString, Future[BackupResult]] = {
    val targetBucket = s3Config.dataBucket
    S3.multipartUploadWithHeaders(targetBucket, key, s3Headers = s3Headers)
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
202 changes: 202 additions & 0 deletions
202
core-backup/src/main/scala/aiven/io/guardian/kafka/backup/BackupClientInterface.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
package aiven.io.guardian.kafka.backup | ||
|
||
import aiven.io.guardian.kafka.KafkaClientInterface | ||
import aiven.io.guardian.kafka.backup.configs.Backup | ||
import aiven.io.guardian.kafka.codecs.Circe._ | ||
import aiven.io.guardian.kafka.models.ReducedConsumerRecord | ||
import akka.Done | ||
import akka.stream.FlowShape | ||
import akka.stream.scaladsl._ | ||
import akka.util.ByteString | ||
import io.circe.syntax._ | ||
|
||
import java.time._ | ||
import java.time.format.DateTimeFormatter | ||
import java.time.temporal._ | ||
import scala.concurrent.Future | ||
import scala.concurrent.duration.FiniteDuration | ||
|
||
/** A marker used to indicate in which position the current backup stream is.
  * Used by `BackupClientInterface.transformReducedConsumerRecords` to decide whether
  * to open (`[`), continue (`,`) or close (`]`) the JSON array being written out.
  */
sealed abstract class BackupStreamPosition

object BackupStreamPosition {

  /** The backup stream has just started right now
    */
  case object Start extends BackupStreamPosition

  /** The backup stream is in the middle of a time period
    */
  case object Middle extends BackupStreamPosition

  /** The backup stream position has just hit a boundary for when a new period starts
    */
  case object Boundary extends BackupStreamPosition
}
|
||
/** Generic interface for backing up a Kafka stream into some storage backend.
  * Concrete implementations (e.g. an S3 client) supply `backupToStorageSink`; this
  * trait wires that sink together with Kafka cursor committing and period-based
  * splitting of the stream into separate backup objects.
  */
trait BackupClientInterface {
  implicit val kafkaClientInterface: KafkaClientInterface
  implicit val backupConfig: Backup

  /** A type representing any kind of result when backing up data to a datasource
    */
  type BackupResult

  import BackupClientInterface._

  /** How to backup a `ByteString` to a `DataSource`
    * @param key The object key or filename for what is being backed up
    * @return A Sink that also provides a `BackupResult`
    */
  def backupToStorageSink(key: String): Sink[ByteString, Future[BackupResult]]

  /** A Flow that both backs up the `ByteString` data to a data source and then
    * commits the Kafka `CursorContext` using `kafkaClientInterface.commitCursor`.
    * @param key The object key or filename for what is being backed up
    * @return The `CursorContext` which can be used for logging/debugging
    */
  def backupAndCommitFlow(
      key: String
  ): Flow[(ByteString, kafkaClientInterface.CursorContext), kafkaClientInterface.CursorContext, Future[Done]] = {
    // Fans each (data, context) pair out: the data half goes to the storage sink,
    // the context half to the Kafka commit sink. The materialized value is the
    // commit sink's Future[Done] (the storage sink's materialized value is dropped).
    val sink = Flow.fromGraph(
      GraphDSL.create(
        backupToStorageSink(key),
        kafkaClientInterface.commitCursor
      )((_, cursorCommitted) => cursorCommitted)(implicit builder =>
        (backupSink, commitCursor) => {
          import GraphDSL.Implicits._

          // NOTE(review): b.out is used three times below (two ~> wirings plus the
          // FlowShape outlet). GraphDSL outlets are normally single-use — confirm this
          // materializes; a Broadcast stage may be what was intended here.
          val b = builder.add(Concat[(ByteString, kafkaClientInterface.CursorContext)]())

          b.out.map(_._1) ~> backupSink
          b.out.map(_._2) ~> commitCursor

          new FlowShape(b.in(0), b.out)
        }
      )
    )
    sink.map { case (_, context) => context }
  }

  /** The entire flow that involves reading from Kafka, transforming the data into JSON and then backing it up into
    * a data source.
    * @return The `CursorContext` which can be used for logging/debugging along with the `kafkaClientInterface.Control`
    * which can be used to control the Stream
    */
  protected def backup: Source[kafkaClientInterface.CursorContext, kafkaClientInterface.Control] = {
    // TODO use https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3api/list-multipart-uploads.html
    // and https://stackoverflow.com/questions/53764876/resume-s3-multipart-upload-partetag to find any in progress
    // multiupload to resume from previous termination. Looks like we will have to do this manually since its not in
    // Alpakka yet

    // Tag every record with the number of elapsed period slices since a reference time.
    // NOTE(review): the reference time passed here is the record's OWN timestamp, so
    // calculateNumberOfPeriodsFromTimestamp always yields 0 — presumably this was meant
    // to be a fixed stream-start time; verify against the intended splitting behavior.
    val withPeriods = kafkaClientInterface.getSource.map { reducedConsumerRecord =>
      val period = calculateNumberOfPeriodsFromTimestamp(reducedConsumerRecord.toOffsetDateTime,
                                                         backupConfig.periodSlice,
                                                         reducedConsumerRecord
      )
      (reducedConsumerRecord, period)
    }

    // Pairwise-compare neighbouring records to classify each record's stream position.
    // NOTE(review): the partial matches on Seq(_, _) will MatchError if sliding(2)
    // emits a trailing 1-element window (single-record stream / end of stream) — confirm.
    val withBackupStreamPositions = withPeriods
      .sliding(2)
      .map { case Seq((beforeReducedConsumerRecord, beforeDivisions), (_, afterDivisions)) =>
        val backupStreamPosition = splitAtBoundaryCondition(beforeDivisions, afterDivisions)

        (beforeReducedConsumerRecord, backupStreamPosition)
      }
      .mapContext { case Seq(head, _) => head }

    // Start a new substream (i.e. a new backup object) after each period boundary.
    val split = withBackupStreamPositions.asSource.splitAfter { case ((_, backupStreamPosition), _) =>
      backupStreamPosition == BackupStreamPosition.Boundary
    }

    split
      .prefixAndTail(1)
      .flatMapConcat { case (head, restOfReducedConsumerRecords) =>
        head.headOption match {
          case Some(((firstReducedConsumerRecord, _), firstContext)) =>
            // The object key for the whole substream is derived from its first record.
            val key = calculateKey(firstReducedConsumerRecord.toOffsetDateTime)

            // Re-emit the first record forced to Start so the JSON array is opened,
            // then continue with the remainder of the substream.
            val combined = Source.combine(
              Source.single(
                (
                  (firstReducedConsumerRecord, BackupStreamPosition.Start),
                  firstContext
                )
              ),
              restOfReducedConsumerRecords
            )(Concat(_))

            // Serialize each record to its ByteString JSON fragment, keeping the context.
            val transformed = combined.map { case ((record, position), context) =>
              val transform = transformReducedConsumerRecords(record, position)
              (transform, context)
            }

            transformed.via(backupAndCommitFlow(key))
          case None =>
            // TODO Is it possible to hit this branch? I assume if the Stream is started its impossible for
            // head to be empty
            // NOTE(review): if this branch IS reachable, ??? throws NotImplementedError
            // and fails the stream — consider Source.empty as a safe fallback.
            ???
        }
      }
      .mergeSubstreams
  }
}
|
||
object BackupClientInterface {

  /** Serializes a `ReducedConsumerRecord` to its compact (no-spaces) JSON representation.
    * @param reducedConsumerRecord The record to serialize
    * @return The record as a single-line JSON string
    */
  def reducedConsumerRecordAsString(reducedConsumerRecord: ReducedConsumerRecord): String =
    io.circe.Printer.noSpaces.print(reducedConsumerRecord.asJson)

  /** Formats an `OffsetDateTime` using the ISO-8601 extended offset date-time format.
    * @param offsetDateTime The time to format
    * @return e.g. "2011-12-03T10:15:30+01:00"
    */
  def formatOffsetDateTime(offsetDateTime: OffsetDateTime): String =
    offsetDateTime.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)

  /** Calculate an object storage key or filename for a ReducedConsumerRecord
    * @param offsetDateTime A given time
    * @return A `String` that can be used either as some object key or a filename
    */
  // Explicit result type added: public API members should annotate their types.
  def calculateKey(offsetDateTime: OffsetDateTime): String =
    s"${BackupClientInterface.formatOffsetDateTime(offsetDateTime)}.json"

  /** Calculates the current position in 2 element sliding of a Stream.
    * @param dividedPeriodsBefore The number of divided periods in the first element of the slide. -1 is used as a
    *                             sentinel value to indicate the start of the stream
    * @param dividedPeriodsAfter The number of divided periods in the second element of the slide
    * @return The position of the Stream
    */
  def splitAtBoundaryCondition(dividedPeriodsBefore: Long, dividedPeriodsAfter: Long): BackupStreamPosition =
    (dividedPeriodsBefore, dividedPeriodsAfter) match {
      // Literal pattern instead of a guard for the sentinel value.
      case (-1, _) =>
        BackupStreamPosition.Start
      case (before, after) if after > before =>
        BackupStreamPosition.Boundary
      case _ =>
        BackupStreamPosition.Middle
    }

  /** Transforms a `ReducedConsumer` record into a ByteString so that it can be persisted into the data storage
    * @param reducedConsumerRecord The ReducedConsumerRecord to persist
    * @param backupStreamPosition The position of the record relative in the stream (so it knows if its at the start,
    *                             middle or end)
    * @return a `ByteString` ready to be persisted
    */
  def transformReducedConsumerRecords(reducedConsumerRecord: ReducedConsumerRecord,
                                      backupStreamPosition: BackupStreamPosition
  ): ByteString = {
    // Start opens the JSON array, Middle continues it, Boundary closes it, so a whole
    // substream concatenates into one valid JSON array document.
    val string = backupStreamPosition match {
      case BackupStreamPosition.Start =>
        s"[${reducedConsumerRecordAsString(reducedConsumerRecord)},"
      case BackupStreamPosition.Middle =>
        s"${reducedConsumerRecordAsString(reducedConsumerRecord)},"
      case BackupStreamPosition.Boundary =>
        s"${reducedConsumerRecordAsString(reducedConsumerRecord)}]"
    }
    ByteString(string)
  }

  /** Number of whole `period` slices between `initialTime` and the record's timestamp.
    * @param initialTime The reference time to measure from
    * @param period The length of one period slice
    * @param reducedConsumerRecord The record whose timestamp is measured
    * @return The (truncated) number of elapsed periods
    */
  protected def calculateNumberOfPeriodsFromTimestamp(initialTime: OffsetDateTime,
                                                      period: FiniteDuration,
                                                      reducedConsumerRecord: ReducedConsumerRecord
  ): Long =
    // TODO handle overflow?
    // NOTE(review): the only call site passes the record's own toOffsetDateTime as
    // initialTime, which makes this always 0 — confirm initialTime should instead be a
    // fixed stream-start time. Also note MICROS.between(record, initial) is negative when
    // the record is after initialTime, and Long division truncates toward zero.
    ChronoUnit.MICROS.between(reducedConsumerRecord.toOffsetDateTime, initialTime) / period.toMicros
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
12 changes: 12 additions & 0 deletions
12
core-s3/src/main/scala/aiven/io/guardian/kafka/s3/configs/S3.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package aiven.io.guardian.kafka.s3.configs | ||
|
||
/** @param dataBucket The bucket where a Kafka Consumer directly streams data into as storage | ||
* @param dataBucketPrefix Prefix for the data bucket (if any) | ||
* @param compactionBucket The bucket where compaction results are stored | ||
* @param compactionBucketPrefix Prefix for the compaction bucket (if any) | ||
*/ | ||
/** S3 bucket configuration for Guardian.
  *
  * @param dataBucket The bucket where a Kafka Consumer directly streams data into as storage
  * @param dataBucketPrefix Prefix for the data bucket (if any)
  * @param compactionBucket The bucket where compaction results are stored
  * @param compactionBucketPrefix Prefix for the compaction bucket (if any)
  */
final case class S3(
    dataBucket: String,
    dataBucketPrefix: Option[String],
    compactionBucket: String,
    compactionBucketPrefix: Option[String]
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
13 changes: 10 additions & 3 deletions
13
core/src/main/scala/aiven/io/guardian/kafka/KafkaClientInterface.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,27 @@ | ||
package aiven.io.guardian.kafka | ||
|
||
import aiven.io.guardian.kafka.models.ReducedConsumerRecord | ||
import akka.stream.scaladsl.SourceWithContext | ||
import akka.Done | ||
import akka.stream.scaladsl.{Sink, SourceWithContext} | ||
|
||
import scala.concurrent.Future | ||
|
||
/** Abstraction over a Kafka consumer: provides the record source and a sink for
  * committing processed cursors, so backup logic stays independent of the concrete
  * Kafka client implementation.
  */
trait KafkaClientInterface {

  /** The type of the context to pass around. In context of a Kafka consumer, this typically holds offset data to be
    * automatically committed
    */
  type CursorContext

  /** The type that represents how to control the given stream, i.e. if you want to shut it down or add metrics
    */
  type Control

  /** @return A `SourceWithContext` that returns a Kafka Stream which automatically handles committing of cursors
    */
  def getSource: SourceWithContext[ReducedConsumerRecord, CursorContext, Control]

  /** @return A `Sink` that allows you to commit a `CursorContext` to Kafka to signify you have processed a message.
    *         The materialized `Future[Done]` completes when the sink finishes.
    */
  def commitCursor: Sink[CursorContext, Future[Done]]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters