-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ISSUE #10552] move stream slicer concept in concurrent CDK
- Loading branch information
Showing
12 changed files
with
384 additions
and
264 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
|
||
from typing import Iterable, Optional, Mapping, Any | ||
|
||
from airbyte_cdk.sources.declarative.retrievers import Retriever | ||
from airbyte_cdk.sources.message import MessageRepository | ||
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition | ||
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator | ||
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record | ||
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer | ||
from airbyte_cdk.sources.types import StreamSlice | ||
from airbyte_cdk.utils.slice_hasher import SliceHasher | ||
|
||
|
||
class DeclarativePartitionFactory:
    """
    Builds DeclarativePartition instances for a single declarative stream.

    Every partition produced by one factory shares the same stream name,
    JSON schema, retriever and message repository; only the slice differs.
    """

    # FIXME: this should take a retriever_factory instead of a retriever so
    # that paginators and other stateful collaborators don't share
    # internal/class state across partitions.
    def __init__(
        self,
        stream_name: str,
        json_schema: Mapping[str, Any],
        retriever: Retriever,
        message_repository: MessageRepository,
    ) -> None:
        self._stream_name = stream_name
        self._json_schema = json_schema
        self._retriever = retriever
        self._message_repository = message_repository

    def create(self, stream_slice: StreamSlice) -> Partition:
        """Return a partition bound to the given stream slice."""
        return DeclarativePartition(
            stream_name=self._stream_name,
            json_schema=self._json_schema,
            retriever=self._retriever,
            message_repository=self._message_repository,
            stream_slice=stream_slice,
        )
|
||
|
||
class DeclarativePartition(Partition):
    """
    One unit of concurrent work for a declarative stream: a single stream
    slice read through the stream's retriever.
    """

    def __init__(
        self,
        stream_name: str,
        json_schema: Mapping[str, Any],
        retriever: Retriever,
        message_repository: MessageRepository,
        stream_slice: StreamSlice,
    ):
        self._stream_name = stream_name
        self._json_schema = json_schema
        self._retriever = retriever
        self._message_repository = message_repository
        self._stream_slice = stream_slice
        # Pre-compute once so __hash__ stays cheap and stable for the
        # partition's lifetime.
        self._hash = SliceHasher.hash(self._stream_name, self._stream_slice)

    def read(self) -> Iterable[Record]:
        """
        Read every record for this partition's slice.

        Mapping-shaped items are yielded as Record objects; anything else
        coming out of the retriever is forwarded to the message repository.
        """
        for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
            if not isinstance(stream_data, Mapping):
                self._message_repository.emit_message(stream_data)
            else:
                # TODO validate if this is necessary: self._stream.transformer.transform(data_to_return, self._stream.get_json_schema())
                yield Record(stream_data, self)

    def to_slice(self) -> Optional[Mapping[str, Any]]:
        """Return the slice this partition was built from."""
        return self._stream_slice

    def stream_name(self) -> str:
        """Return the name of the stream this partition belongs to."""
        return self._stream_name

    def __hash__(self) -> int:
        return self._hash
|
||
|
||
class StreamSlicerPartitionGenerator(PartitionGenerator):
    """
    Adapts a StreamSlicer to the concurrent PartitionGenerator interface,
    turning each slice the slicer emits into a partition.
    """

    def __init__(self, partition_factory: DeclarativePartitionFactory, stream_slicer: StreamSlicer) -> None:
        self._partition_factory = partition_factory
        self._stream_slicer = stream_slicer

    def generate(self) -> Iterable[Partition]:
        """Lazily yield one partition per slice produced by the slicer."""
        yield from map(self._partition_factory.create, self._stream_slicer.stream_slices())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
21 changes: 21 additions & 0 deletions
21
airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import Iterable | ||
|
||
from airbyte_cdk.sources.types import StreamSlice | ||
|
||
|
||
class StreamSlicer(ABC):
    """
    Interface for breaking a stream into chunks that can be fetched
    independently. Slicing enables state checkpointing and parallel data
    retrieval.
    """

    @abstractmethod
    def stream_slices(self) -> Iterable[StreamSlice]:
        """
        Produce the slices that partition this stream.

        :return: An iterable of stream slices
        """
Oops, something went wrong.