-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add Composite Raw Decoder (#179)
Signed-off-by: Artem Inzhyyants <[email protected]>
- Loading branch information
Showing
9 changed files
with
447 additions
and
68 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ name: Packaging and Publishing | |
on: | ||
push: | ||
tags: | ||
- 'v*' | ||
- "v*" | ||
workflow_dispatch: | ||
inputs: | ||
version: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
97 changes: 97 additions & 0 deletions
97
airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import csv | ||
import gzip | ||
import json | ||
import logging | ||
from abc import ABC, abstractmethod | ||
from dataclasses import dataclass | ||
from io import BufferedIOBase, TextIOWrapper | ||
from typing import Any, Generator, MutableMapping, Optional | ||
|
||
import requests | ||
|
||
from airbyte_cdk.sources.declarative.decoders.decoder import Decoder | ||
|
||
logger = logging.getLogger("airbyte") | ||
|
||
|
||
@dataclass
class Parser(ABC):
    """Abstract base class for parsers that turn a binary stream into records."""

    @abstractmethod
    def parse(self, data: BufferedIOBase) -> Generator[MutableMapping[str, Any], None, None]:
        """
        Consume ``data`` and yield one mapping per parsed record.

        Concrete subclasses must override this method; the base class
        cannot be instantiated directly.
        """
|
||
|
||
@dataclass
class GzipParser(Parser):
    """Parser that gunzips the incoming stream before delegating to another parser."""

    # Parser that receives the decompressed stream.
    inner_parser: Parser

    def parse(self, data: BufferedIOBase) -> Generator[MutableMapping[str, Any], None, None]:
        """
        Wrap ``data`` in a gzip reader and yield whatever the inner parser
        produces from the decompressed bytes.
        """
        decompressed = gzip.GzipFile(fileobj=data, mode="rb")
        # The gzip reader is closed once the inner parser is exhausted
        # (or the generator is closed early).
        with decompressed:
            yield from self.inner_parser.parse(decompressed)
|
||
|
||
@dataclass
class JsonLineParser(Parser):
    """Parser for newline-delimited JSON (one JSON document per line)."""

    # Text encoding used to decode each line; a falsy value falls back to UTF-8.
    encoding: Optional[str] = "utf-8"

    def parse(
        self,
        data: BufferedIOBase,
    ) -> Generator[MutableMapping[str, Any], None, None]:
        """
        Decode each line of ``data`` and yield the parsed JSON value.

        Lines that cannot be decoded with the configured encoding or that
        are not valid JSON are logged as warnings and skipped, so a single
        bad line does not abort the rest of the stream.
        """
        encoding = self.encoding or "utf-8"
        for line in data:
            # Keep the try body limited to the decode/parse step so that
            # exceptions raised by the consumer at the yield point are not
            # swallowed here.
            try:
                record = json.loads(line.decode(encoding=encoding))
            except (UnicodeDecodeError, json.JSONDecodeError) as e:
                logger.warning(f"Cannot decode/parse line {line!r} as JSON, error: {e}")
                continue
            yield record
|
||
|
||
@dataclass
class CsvParser(Parser):
    """Parser for CSV data whose first row is the header."""

    # TODO: migrate implementation to re-use file-base classes
    # Text encoding of the stream; a falsy value falls back to UTF-8.
    encoding: Optional[str] = "utf-8"
    # Field separator; a falsy value falls back to ",".
    delimiter: Optional[str] = ","

    def parse(
        self,
        data: BufferedIOBase,
    ) -> Generator[MutableMapping[str, Any], None, None]:
        """
        Decode ``data`` as text and yield one dict per CSV row, keyed by the
        header row's column names.
        """
        # Fall back to UTF-8 explicitly: passing encoding=None would make
        # TextIOWrapper use the locale's preferred encoding instead.
        # newline="" lets the csv module handle line endings itself, so
        # newlines embedded in quoted fields are preserved as written.
        text_data = TextIOWrapper(data, encoding=self.encoding or "utf-8", newline="")  # type: ignore
        reader = csv.DictReader(text_data, delimiter=self.delimiter or ",")
        yield from reader
|
||
|
||
@dataclass
class CompositeRawDecoder(Decoder):
    """
    Decoder that feeds the raw body of a requests.Response to a parser and
    yields the records that parser produces.

    The parser is given ``response.raw``.
    Note: response.raw is not decoded/decompressed by default, so any
    decompression step has to be expressed as a parser. Parsers are composed
    by nesting them.

    Example:
        composite_raw_decoder = CompositeRawDecoder(parser=GzipParser(inner_parser=JsonLineParser(encoding="iso-8859-1")))
    """

    # Outermost parser applied to the raw response stream.
    parser: Parser

    def is_stream_response(self) -> bool:
        """Always report True."""
        return True

    def decode(
        self, response: requests.Response
    ) -> Generator[MutableMapping[str, Any], None, None]:
        """Yield every record the configured parser extracts from ``response.raw``."""
        raw_stream = response.raw
        yield from self.parser.parse(data=raw_stream)  # type: ignore[arg-type]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.