Skip to content

Commit

Permalink
feat: Async Retriever add url_requester (#211)
Browse files Browse the repository at this point in the history
Signed-off-by: Artem Inzhyyants <[email protected]>
  • Loading branch information
artem1205 authored Jan 14, 2025
1 parent e3f904a commit e18e407
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2977,6 +2977,11 @@ definitions:
anyOf:
- "$ref": "#/definitions/CustomRequester"
- "$ref": "#/definitions/HttpRequester"
url_requester:
description: Requester component that describes how to prepare HTTP requests to send to the source API in order to extract the download URL from the polling response of the completed async job.
anyOf:
- "$ref": "#/definitions/CustomRequester"
- "$ref": "#/definitions/HttpRequester"
download_requester:
description: Requester component that describes how to prepare HTTP requests to send to the source API to download the data provided by the completed async job.
anyOf:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -737,33 +737,43 @@ class KeysToSnakeCase(BaseModel):
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


class FlattenFields(BaseModel):
    type: Literal["FlattenFields"]
    # Defaults to True, meaning nested lists are flattened as well as nested dicts.
    flatten_lists: Optional[bool] = Field(
        default=True,
        title="Flatten Lists",
        description="Whether to flatten lists or leave it as is. Default is True.",
    )
    # Interpolation parameters propagated from the parent component.
    parameters: Optional[Dict[str, Any]] = Field(default=None, alias="$parameters")


class KeysReplace(BaseModel):
    type: Literal["KeysReplace"]
    # Substring to search for in record keys; supports interpolation.
    old: str = Field(
        ...,
        title="Old value",
        description="Old value to replace.",
        examples=[" ", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
    )
    # Replacement substring; supports interpolation.
    new: str = Field(
        ...,
        title="New value",
        description="New value to set.",
        examples=["_", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
    )
    # Interpolation parameters propagated from the parent component.
    parameters: Optional[Dict[str, Any]] = Field(default=None, alias="$parameters")


class FlattenFields(BaseModel):
    type: Literal["FlattenFields"]
    # Defaults to True, meaning nested lists are flattened as well as nested dicts.
    flatten_lists: Optional[bool] = Field(
        default=True,
        title="Flatten Lists",
        description="Whether to flatten lists or leave it as is. Default is True.",
    )
    # Interpolation parameters propagated from the parent component.
    parameters: Optional[Dict[str, Any]] = Field(default=None, alias="$parameters")


class IterableDecoder(BaseModel):
type: Literal["IterableDecoder"]

Expand Down Expand Up @@ -2040,6 +2050,10 @@ class AsyncRetriever(BaseModel):
...,
description="Requester component that describes how to prepare HTTP requests to send to the source API to fetch the status of the running async job.",
)
url_requester: Optional[Union[CustomRequester, HttpRequester]] = Field(
None,
description="Requester component that describes how to prepare HTTP requests to send to the source API to extract the url from polling response by the completed async job.",
)
download_requester: Union[CustomRequester, HttpRequester] = Field(
...,
description="Requester component that describes how to prepare HTTP requests to send to the source API to download the data provided by the completed async job.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2324,6 +2324,16 @@ def create_async_retriever(
if model.delete_requester
else None
)
url_requester = (
self._create_component_from_model(
model=model.url_requester,
decoder=decoder,
config=config,
name=f"job extract_url - {name}",
)
if model.url_requester
else None
)
status_extractor = self._create_component_from_model(
model=model.status_extractor, decoder=decoder, config=config, name=name
)
Expand All @@ -2334,6 +2344,7 @@ def create_async_retriever(
creation_requester=creation_requester,
polling_requester=polling_requester,
download_retriever=download_retriever,
url_requester=url_requester,
abort_requester=abort_requester,
delete_requester=delete_requester,
status_extractor=status_extractor,
Expand Down
57 changes: 57 additions & 0 deletions airbyte_cdk/sources/declarative/requesters/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# AsyncHttpJobRepository sequence diagram

- Components marked as optional are not required and can be ignored.
- If `url_requester` is not provided, `urls_extractor` will extract the download URLs directly from the `polling_job_response`.
- Interpolation context variables such as `create_job_response` or `polling_job_response` can be obtained from the `stream_slice`.


```mermaid
---
title: AsyncHttpJobRepository Sequence Diagram
---
sequenceDiagram
participant AsyncHttpJobRepository as AsyncOrchestrator
participant CreationRequester as creation_requester
participant PollingRequester as polling_requester
participant UrlRequester as url_requester (Optional)
participant DownloadRetriever as download_retriever
participant AbortRequester as abort_requester (Optional)
participant DeleteRequester as delete_requester (Optional)
participant Reporting Server as Async Reporting Server
AsyncHttpJobRepository ->> CreationRequester: Initiate job creation
CreationRequester ->> Reporting Server: Create job request
Reporting Server -->> CreationRequester: Job ID response
CreationRequester -->> AsyncHttpJobRepository: Job ID
loop Poll for job status
AsyncHttpJobRepository ->> PollingRequester: Check job status
PollingRequester ->> Reporting Server: Status request (interpolation_context: `create_job_response`)
Reporting Server -->> PollingRequester: Status response
PollingRequester -->> AsyncHttpJobRepository: Job status
end
alt Status: Ready
AsyncHttpJobRepository ->> UrlRequester: Request download URLs (if applicable)
UrlRequester ->> Reporting Server: URL request (interpolation_context: `polling_job_response`)
Reporting Server -->> UrlRequester: Download URLs
UrlRequester -->> AsyncHttpJobRepository: Download URLs
AsyncHttpJobRepository ->> DownloadRetriever: Download reports
DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`)
Reporting Server -->> DownloadRetriever: Report data
DownloadRetriever -->> AsyncHttpJobRepository: Report data
else Status: Failed
AsyncHttpJobRepository ->> AbortRequester: Send abort request
AbortRequester ->> Reporting Server: Abort job
Reporting Server -->> AbortRequester: Abort confirmation
AbortRequester -->> AsyncHttpJobRepository: Confirmation
end
AsyncHttpJobRepository ->> DeleteRequester: Send delete job request
DeleteRequester ->> Reporting Server: Delete job
Reporting Server -->> DeleteRequester: Deletion confirmation
DeleteRequester -->> AsyncHttpJobRepository: Confirmation
```
30 changes: 27 additions & 3 deletions airbyte_cdk/sources/declarative/requesters/http_job_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@

@dataclass
class AsyncHttpJobRepository(AsyncJobRepository):
"""
See Readme file for more details about flow.
"""

creation_requester: Requester
polling_requester: Requester
download_retriever: SimpleRetriever
Expand All @@ -44,6 +48,9 @@ class AsyncHttpJobRepository(AsyncJobRepository):
record_extractor: RecordExtractor = field(
init=False, repr=False, default_factory=lambda: ResponseToFileExtractor({})
)
url_requester: Optional[Requester] = (
None # use it in case polling_requester provides some <id> and extra request is needed to obtain list of urls to download from
)

def __post_init__(self) -> None:
self._create_job_response_by_id: Dict[str, Response] = {}
Expand Down Expand Up @@ -186,9 +193,7 @@ def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]:
"""

for url in self.urls_extractor.extract_records(
self._polling_job_response_by_id[job.api_job_id()]
):
for url in self._get_download_url(job):
job_slice = job.job_parameters()
stream_slice = StreamSlice(
partition=job_slice.partition,
Expand Down Expand Up @@ -231,3 +236,22 @@ def _get_create_job_stream_slice(self, job: AsyncJob) -> StreamSlice:
cursor_slice={},
)
return stream_slice

def _get_download_url(self, job: AsyncJob) -> Iterable[str]:
    """Yield the download URL(s) for a completed async *job*.

    If no ``url_requester`` is configured, URLs are extracted straight from
    the polling response stored for this job. Otherwise one extra request is
    sent first, with the polling response exposed to the requester through
    the ``polling_job_response`` interpolation context.
    """
    polling_response = self._polling_job_response_by_id[job.api_job_id()]
    if self.url_requester:
        stream_slice = StreamSlice(
            partition={"polling_job_response": polling_response},
            cursor_slice={},
        )
        url_response = self.url_requester.send_request(stream_slice=stream_slice)  # type: ignore # we expect url_requester to always be presented, otherwise raise an exception as we cannot proceed with the report
        # A missing response means we cannot obtain the report URLs at all.
        if not url_response:
            raise AirbyteTracedException(
                internal_message="Always expect a response or an exception from url_requester",
                failure_type=FailureType.system_error,
            )
    else:
        url_response = polling_response

    yield from self.urls_extractor.extract_records(url_response)  # type: ignore # we expect urls_extractor to always return list of strings

0 comments on commit e18e407

Please sign in to comment.