-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore(refactor): refactor partition generator to take any stream slicer (#39) Co-authored-by: Aaron ("AJ") Steers <[email protected]> Co-authored-by: octavia-squidington-iii <[email protected]> Co-authored-by: Brian Lai <[email protected]>
- Loading branch information
1 parent
e808271
commit e27cb81
Showing
13 changed files
with
552 additions
and
295 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
85 changes: 85 additions & 0 deletions
85
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
|
||
from typing import Iterable, Optional, Mapping, Any, Callable | ||
|
||
from airbyte_cdk.sources.declarative.retrievers import Retriever | ||
from airbyte_cdk.sources.message import MessageRepository | ||
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition | ||
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator | ||
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record | ||
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer | ||
from airbyte_cdk.sources.types import StreamSlice | ||
from airbyte_cdk.utils.slice_hasher import SliceHasher | ||
|
||
|
||
class DeclarativePartitionFactory:
    """Builds ``DeclarativePartition`` instances for one declarative stream.

    A *factory* for the retriever is held instead of a retriever instance:
    declarative components (e.g. ``DefaultPaginator``) are not thread-safe,
    so each partition gets its own freshly built retriever. This keeps
    concurrent partition reads from mutating shared paginator state.
    """

    def __init__(
        self,
        stream_name: str,
        json_schema: Mapping[str, Any],
        retriever_factory: Callable[[], Retriever],
        message_repository: MessageRepository,
    ) -> None:
        """
        :param stream_name: name of the stream the partitions belong to
        :param json_schema: JSON schema of the stream's records
        :param retriever_factory: zero-arg callable producing a new Retriever per partition
        :param message_repository: sink for non-record messages emitted while reading
        """
        self._stream_name = stream_name
        self._json_schema = json_schema
        self._retriever_factory = retriever_factory
        self._message_repository = message_repository

    def create(self, stream_slice: StreamSlice) -> Partition:
        """Return a new partition for ``stream_slice``, backed by its own retriever."""
        return DeclarativePartition(
            stream_name=self._stream_name,
            json_schema=self._json_schema,
            retriever=self._retriever_factory(),
            message_repository=self._message_repository,
            stream_slice=stream_slice,
        )
|
||
|
||
class DeclarativePartition(Partition):
    """A concurrent-framework ``Partition`` that reads one stream slice through a declarative retriever."""

    def __init__(
        self,
        stream_name: str,
        json_schema: Mapping[str, Any],
        retriever: Retriever,
        message_repository: MessageRepository,
        stream_slice: StreamSlice,
    ):
        self._stream_name = stream_name
        self._json_schema = json_schema
        self._retriever = retriever
        self._message_repository = message_repository
        self._stream_slice = stream_slice
        # Precomputed once: the stream name and slice are fixed at construction time.
        self._hash = SliceHasher.hash(self._stream_name, self._stream_slice)

    def read(self) -> Iterable[Record]:
        """Yield a ``Record`` per mapping produced by the retriever for this slice.

        Anything that is not a mapping (presumably an Airbyte message — confirm
        against ``Retriever.read_records``) is forwarded to the message
        repository instead of being yielded.
        """
        for item in self._retriever.read_records(self._json_schema, self._stream_slice):
            if not isinstance(item, Mapping):
                self._message_repository.emit_message(item)
            else:
                yield Record(item, self)

    def to_slice(self) -> Optional[Mapping[str, Any]]:
        """Return the slice this partition covers."""
        return self._stream_slice

    def stream_name(self) -> str:
        """Return the name of the stream this partition belongs to."""
        return self._stream_name

    def __hash__(self) -> int:
        return self._hash
|
||
|
||
class StreamSlicerPartitionGenerator(PartitionGenerator):
    """Adapts any ``StreamSlicer`` to the concurrent framework's ``PartitionGenerator`` interface."""

    def __init__(
        self, partition_factory: DeclarativePartitionFactory, stream_slicer: StreamSlicer
    ) -> None:
        self._partition_factory = partition_factory
        self._stream_slicer = stream_slicer

    def generate(self) -> Iterable[Partition]:
        """Lazily create one partition per slice produced by the slicer."""
        yield from map(self._partition_factory.create, self._stream_slicer.stream_slices())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.