diff --git a/hathor/serialization/__init__.py b/hathor/serialization/__init__.py new file mode 100644 index 000000000..65e1626a0 --- /dev/null +++ b/hathor/serialization/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .deserializer import Deserializer +from .exceptions import BadDataError, OutOfDataError, SerializationError, TooLongError, UnsupportedTypeError +from .serializer import Serializer + +__all__ = [ + 'Serializer', + 'Deserializer', + 'SerializationError', + 'UnsupportedTypeError', + 'TooLongError', + 'OutOfDataError', + 'BadDataError', +] diff --git a/hathor/serialization/adapters/__init__.py b/hathor/serialization/adapters/__init__.py new file mode 100644 index 000000000..8667c7684 --- /dev/null +++ b/hathor/serialization/adapters/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .generic_adapter import GenericDeserializerAdapter, GenericSerializerAdapter +from .max_bytes import MaxBytesDeserializer, MaxBytesExceededError, MaxBytesSerializer + +__all__ = [ + 'GenericDeserializerAdapter', + 'GenericSerializerAdapter', + 'MaxBytesDeserializer', + 'MaxBytesExceededError', + 'MaxBytesSerializer', +] diff --git a/hathor/serialization/adapters/generic_adapter.py b/hathor/serialization/adapters/generic_adapter.py new file mode 100644 index 000000000..9d7540bce --- /dev/null +++ b/hathor/serialization/adapters/generic_adapter.py @@ -0,0 +1,110 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from types import TracebackType +from typing import Generic, TypeVar + +from typing_extensions import Self, override + +from hathor.serialization.deserializer import Deserializer +from hathor.serialization.serializer import Serializer + +from ..types import Buffer + +S = TypeVar('S', bound=Serializer) +D = TypeVar('D', bound=Deserializer) + + +class GenericSerializerAdapter(Serializer, Generic[S]): + inner: S + + def __init__(self, serializer: S) -> None: + self.inner = serializer + + @override + def finalize(self) -> Buffer: + return self.inner.finalize() + + @override + def cur_pos(self) -> int: + return self.inner.cur_pos() + + @override + def write_byte(self, data: int) -> None: + self.inner.write_byte(data) + + @override + def write_bytes(self, data: Buffer) -> None: + self.inner.write_bytes(data) + + # allow using this adapter as a context manager: + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + pass + + +class GenericDeserializerAdapter(Deserializer, Generic[D]): + inner: D + + def __init__(self, deserializer: D) -> None: + self.inner = deserializer + + @override + def finalize(self) -> None: + return self.inner.finalize() + + @override + def is_empty(self) -> bool: + return self.inner.is_empty() + + @override + def peek_byte(self) -> int: + return self.inner.peek_byte() + + @override + def peek_bytes(self, n: int, *, exact: bool = True) -> Buffer: + return self.inner.peek_bytes(n, exact=exact) + + @override + def read_byte(self) -> int: + return self.inner.read_byte() + + @override + def read_bytes(self, n: int, *, exact: bool = True) -> Buffer: + return self.inner.read_bytes(n, exact=exact) + + @override + def read_all(self) -> Buffer: + return self.inner.read_all() + + # allow using this adapter as a context manager: + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: 
type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + pass diff --git a/hathor/serialization/adapters/max_bytes.py b/hathor/serialization/adapters/max_bytes.py new file mode 100644 index 000000000..e3cf0f455 --- /dev/null +++ b/hathor/serialization/adapters/max_bytes.py @@ -0,0 +1,91 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TypeVar + +from typing_extensions import override + +from hathor.serialization.deserializer import Deserializer +from hathor.serialization.exceptions import SerializationError +from hathor.serialization.serializer import Serializer + +from ..types import Buffer +from .generic_adapter import GenericDeserializerAdapter, GenericSerializerAdapter + +S = TypeVar('S', bound=Serializer) +D = TypeVar('D', bound=Deserializer) + + +class MaxBytesExceededError(SerializationError): + """ This error is raised when the adapted serializer reached its maximum bytes write/read. + + After this exception is raised the adapted serializer cannot be used anymore. Handlers of this exception are + expected to either: bubble up the exception (or an equivalent exception), or return an error. Handlers should not + try to write again on the same serializer. 
+ + It is possible that the inner serializer is still usable, but the point where the serializer stopped writing or + reading might leave the rest of the data unusable, so for that reason it should be considered a failed + (de)serialization overall, and not simply a failed "read/write" operation. + """ + pass + + +class MaxBytesSerializer(GenericSerializerAdapter[S]): + def __init__(self, serializer: S, max_bytes: int) -> None: + super().__init__(serializer) + self._bytes_left = max_bytes + + def _check_update_exceeds(self, write_size: int) -> None: + self._bytes_left -= write_size + if self._bytes_left < 0: + raise MaxBytesExceededError + + @override + def write_byte(self, data: int) -> None: + self._check_update_exceeds(1) + super().write_byte(data) + + @override + def write_bytes(self, data: Buffer) -> None: + data_view = memoryview(data) + self._check_update_exceeds(len(data_view)) + super().write_bytes(data_view) + + +class MaxBytesDeserializer(GenericDeserializerAdapter[D]): + def __init__(self, deserializer: D, max_bytes: int) -> None: + super().__init__(deserializer) + self._bytes_left = max_bytes + + def _check_update_exceeds(self, read_size: int) -> None: + self._bytes_left -= read_size + if self._bytes_left < 0: + raise MaxBytesExceededError + + @override + def read_byte(self) -> int: + self._check_update_exceeds(1) + return super().read_byte() + + @override + def read_bytes(self, n: int, *, exact: bool = True) -> Buffer: + self._check_update_exceeds(n) + return super().read_bytes(n, exact=exact) + + @override + def read_all(self) -> Buffer: + result = super().read_bytes(self._bytes_left, exact=False) + if not self.is_empty(): + raise MaxBytesExceededError + return result diff --git a/hathor/serialization/bytes_deserializer.py b/hathor/serialization/bytes_deserializer.py new file mode 100644 index 000000000..1a26ec7b2 --- /dev/null +++ b/hathor/serialization/bytes_deserializer.py @@ -0,0 +1,76 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing_extensions import override + +from .deserializer import Deserializer +from .exceptions import OutOfDataError +from .types import Buffer + +_EMPTY_VIEW = memoryview(b'') + + +class BytesDeserializer(Deserializer): + """Simple implementation of a Deserializer to parse values from a byte sequence. + + This implementation maintains a memoryview that is shortened as the bytes are read. + """ + + def __init__(self, data: Buffer) -> None: + self._view = memoryview(data) + + @override + def finalize(self) -> None: + if not self.is_empty(): + raise ValueError('trailing data') + del self._view + + @override + def is_empty(self) -> bool: + # XXX: least amount of OPs, "not" converts to bool with the correct semantics of "is empty" + return not self._view + + @override + def peek_byte(self) -> int: + if not len(self._view): + raise OutOfDataError('not enough bytes to read') + return self._view[0] + + @override + def peek_bytes(self, n: int, *, exact: bool = True) -> memoryview: + if n < 0: + raise ValueError('value cannot be negative') + if exact and len(self._view) < n: + raise OutOfDataError('not enough bytes to read') + return self._view[:n] + + @override + def read_byte(self) -> int: + b = self.peek_byte() + self._view = self._view[1:] + return b + + @override + def read_bytes(self, n: int, *, exact: bool = True) -> memoryview: + b = self.peek_bytes(n, exact=exact) + if exact and len(self._view) < n: + raise OutOfDataError('not enough bytes 
to read') + self._view = self._view[n:] + return b + + @override + def read_all(self) -> memoryview: + b = self._view + self._view = _EMPTY_VIEW + return b diff --git a/hathor/serialization/bytes_serializer.py b/hathor/serialization/bytes_serializer.py new file mode 100644 index 000000000..067e9920b --- /dev/null +++ b/hathor/serialization/bytes_serializer.py @@ -0,0 +1,53 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing_extensions import override + +from .serializer import Serializer +from .types import Buffer + + +class BytesSerializer(Serializer): + """Simple implementation of Serializer to write to memory. + + This implementation defers joining everything until finalize is called, before that every write is stored as a + memoryview in a list. 
+ """ + + def __init__(self) -> None: + self._parts: list[memoryview] = [] + self._pos: int = 0 + + @override + def finalize(self) -> memoryview: + result = memoryview(b''.join(self._parts)) + del self._parts + del self._pos + return result + + @override + def cur_pos(self) -> int: + return self._pos + + @override + def write_byte(self, data: int) -> None: + # int.to_bytes checks for correct range + self._parts.append(memoryview(int.to_bytes(data))) + self._pos += 1 + + @override + def write_bytes(self, data: Buffer) -> None: + part = memoryview(data) + self._parts.append(part) + self._pos += len(part) diff --git a/hathor/serialization/compound_encoding/__init__.py b/hathor/serialization/compound_encoding/__init__.py new file mode 100644 index 000000000..b2e44d889 --- /dev/null +++ b/hathor/serialization/compound_encoding/__init__.py @@ -0,0 +1,50 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module was made to hold compound encoding implementations. + +Compound encoders are encoders that are generic in some way and will delegate the encoding of some portion to another +encoder. For example a `value: Optional[T]` encoder is prepared to encode the value and delegate the rest to an encoder +that knows how to encode `T`. + +The general organization should be that each submodule `x` deals with a single type and look like this: + + def encode_x(serializer: Serializer, value: ValueType, ...config params...) -> None: + ... 
+ + def decode_x(deserializer: Deserializer, ...config params...) -> ValueType: + ... + +The "config params" are optional and specific to each encoder. Submodules should not have to take into consideration +how types are mapped to encoders. +""" + +from typing import Protocol, TypeVar + +from hathor.serialization.deserializer import Deserializer +from hathor.serialization.serializer import Serializer + +T_co = TypeVar('T_co', covariant=True) +T_contra = TypeVar('T_contra', contravariant=True) + + +class Decoder(Protocol[T_co]): + def __call__(self, deserializer: Deserializer, /) -> T_co: + ... + + +class Encoder(Protocol[T_contra]): + def __call__(self, serializer: Serializer, value: T_contra, /) -> None: + ... diff --git a/hathor/serialization/compound_encoding/collection.py b/hathor/serialization/compound_encoding/collection.py new file mode 100644 index 000000000..dc534fa8a --- /dev/null +++ b/hathor/serialization/compound_encoding/collection.py @@ -0,0 +1,64 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +A collection is basically any value that has a known size and is iterable. 
+ +Layout: [N: unsigned leb128][value_0]...[value_N] + +>>> from hathor.serialization.encoding.utf8 import encode_utf8, decode_utf8 +>>> se = Serializer.build_bytes_serializer() +>>> value = ['foobar', 'π', '😎', 'test'] +>>> encode_collection(se, value, encode_utf8) +>>> bytes(se.finalize()).hex() +'0406666f6f62617202cf8004f09f988e0474657374' + +Breakdown of the result: + + 04: 4 in leb128, the total length + 06666f6f626172: 'foobar' (with length prefix) + 02cf80: 'π' (with length prefix) + 04f09f988e: '😎' (with length prefix) + 0474657374: 'test' (with length prefix) + +When decoding, the builder can be any compatible collection, in the previous example a `list` was encoded, but when +decoding a `tuple` could be used, it only matters that the collection can be initialized with an `Iterable[T]`. + +>>> de = Deserializer.build_bytes_deserializer(bytes.fromhex('0406666f6f62617202cf8004f09f988e0474657374')) +>>> decode_collection(de, decode_utf8, tuple) +('foobar', 'π', '😎', 'test') +>>> de.finalize() +""" + +from collections.abc import Collection, Iterable +from typing import Callable, TypeVar + +from hathor.serialization import Deserializer, Serializer +from hathor.serialization.encoding.leb128 import decode_leb128, encode_leb128 + +from . 
import Decoder, Encoder + +T = TypeVar('T') +R = TypeVar('R', bound=Collection) + + +def encode_collection(serializer: Serializer, values: Collection[T], encoder: Encoder[T]) -> None: + encode_leb128(serializer, len(values), signed=False) + for value in values: + encoder(serializer, value) + + +def decode_collection(deserializer: Deserializer, decoder: Decoder[T], builder: Callable[[Iterable[T]], R]) -> R: + length = decode_leb128(deserializer, signed=False) + return builder(decoder(deserializer) for _ in range(length)) diff --git a/hathor/serialization/compound_encoding/mapping.py b/hathor/serialization/compound_encoding/mapping.py new file mode 100644 index 000000000..31eb74c4a --- /dev/null +++ b/hathor/serialization/compound_encoding/mapping.py @@ -0,0 +1,86 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Encoding a mapping is equivalent to encoding a collection of 2-tuples. + +Layout: [N: unsigned leb128][key_0][value_0]...[key_N][value_N] + +>>> from hathor.serialization.encoding.utf8 import encode_utf8, decode_utf8 +>>> from hathor.serialization.encoding.bool import encode_bool, decode_bool +>>> se = Serializer.build_bytes_serializer() +>>> value = { +... 'foo': False, +... 'bar': True, +... 'foobar': True, +... 'baz': False, +... 
} +>>> encode_mapping(se, value, encode_utf8, encode_bool) +>>> bytes(se.finalize()).hex() +'0403666f6f00036261720106666f6f626172010362617a00' + +Breakdown of the result: + + 04: 4 in leb128, the total length + 03666f6f: 'foo' with length prefix + 00: False + 03626172: 'bar' with length prefix + 01: True + 06666f6f626172: 'foobar' with length prefix + 01: True + 0362617a: 'baz' with length prefix + 00: False + +>>> de = Deserializer.build_bytes_deserializer(bytes.fromhex('0403666f6f00036261720106666f6f626172010362617a00')) +>>> decode_mapping(de, decode_utf8, decode_bool, dict) +{'foo': False, 'bar': True, 'foobar': True, 'baz': False} +>>> de.finalize() +""" + +from collections.abc import Iterable, Mapping +from typing import Callable, TypeVar + +from hathor.serialization import Deserializer, Serializer +from hathor.serialization.encoding.leb128 import decode_leb128, encode_leb128 + +from . import Decoder, Encoder + +KT = TypeVar('KT') +VT = TypeVar('VT') +R = TypeVar('R', bound=Mapping) + + +def encode_mapping( + serializer: Serializer, + values_mapping: Mapping[KT, VT], + key_encoder: Encoder[KT], + value_encoder: Encoder[VT], +) -> None: + encode_leb128(serializer, len(values_mapping), signed=False) + for key, value in values_mapping.items(): + key_encoder(serializer, key) + value_encoder(serializer, value) + + +def decode_mapping( + deserializer: Deserializer, + key_decoder: Decoder[KT], + value_decoder: Decoder[VT], + mapping_builder: Callable[[Iterable[tuple[KT, VT]]], R], +) -> R: + size = decode_leb128(deserializer, signed=False) + return mapping_builder( + (key_decoder(deserializer), value_decoder(deserializer)) + for _ in range(size) + ) diff --git a/hathor/serialization/compound_encoding/optional.py b/hathor/serialization/compound_encoding/optional.py new file mode 100644 index 000000000..11c5aa8eb --- /dev/null +++ b/hathor/serialization/compound_encoding/optional.py @@ -0,0 +1,68 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +An optional type is encoded the same way as a collection with max length of 1. + +Layout: + + [0x00] when None + [0x01][value] when not None + +>>> from hathor.serialization.encoding.utf8 import encode_utf8, decode_utf8 +>>> se = Serializer.build_bytes_serializer() +>>> encode_optional(se, 'foobar', encode_utf8) +>>> bytes(se.finalize()).hex() +'0106666f6f626172' + +>>> se = Serializer.build_bytes_serializer() +>>> encode_optional(se, None, encode_utf8) +>>> bytes(se.finalize()).hex() +'00' + +>>> de = Deserializer.build_bytes_deserializer(bytes.fromhex('0106666f6f626172')) +>>> decode_optional(de, decode_utf8) +'foobar' +>>> de.finalize() + +>>> de = Deserializer.build_bytes_deserializer(bytes.fromhex('00')) +>>> str(decode_optional(de, decode_utf8)) +'None' +>>> de.finalize() +""" + +from typing import Optional, TypeVar + +from hathor.serialization import Deserializer, Serializer +from hathor.serialization.encoding.bool import decode_bool, encode_bool + +from . 
import Decoder, Encoder + +T = TypeVar('T') + + +def encode_optional(serializer: Serializer, value: Optional[T], encoder: Encoder[T]) -> None: + if value is None: + encode_bool(serializer, False) + else: + encode_bool(serializer, True) + encoder(serializer, value) + + +def decode_optional(deserializer: Deserializer, decoder: Decoder[T]) -> Optional[T]: + has_value = decode_bool(deserializer) + if has_value: + return decoder(deserializer) + else: + return None diff --git a/hathor/serialization/compound_encoding/tuple.py b/hathor/serialization/compound_encoding/tuple.py new file mode 100644 index 000000000..627f2c48b --- /dev/null +++ b/hathor/serialization/compound_encoding/tuple.py @@ -0,0 +1,66 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +In Python a tuple type can be used in annotations in 2 different ways: + +1. `tuple[A, B, C]`: known fixed length and heterogeneous types +2. `tuple[X, ...]`: variable length and homogeneous type + +This module only implements encoding of the first case, the second case can be encoded using the collection encoder. + +There actually isn't a "format" per-se, the encoding of `tuple[A, B, C]` is just the encoding of A concatenated with B +concatenated with C. So this compound encoder is basically a shortcut that can be used by cases that already have a +tuple of values and a matching tuple of encoders of those values. 
+ +>>> from hathor.serialization.encoding.utf8 import encode_utf8, decode_utf8 +>>> from hathor.serialization.encoding.bool import encode_bool, decode_bool +>>> from hathor.serialization.encoding.bytes import decode_bytes, encode_bytes +>>> se = Serializer.build_bytes_serializer() +>>> values = ('foobar', False, b'test') +>>> encode_tuple(se, values, (encode_utf8, encode_bool, encode_bytes)) +>>> bytes(se.finalize()).hex() +'06666f6f626172000474657374' + +Breakdown of the result: + + 06666f6f626172: 'foobar' + 00: False + 0474657374: b'test' + +>>> de = Deserializer.build_bytes_deserializer(bytes.fromhex('06666f6f626172000474657374')) +>>> decode_tuple(de, (decode_utf8, decode_bool, decode_bytes)) +('foobar', False, b'test') +""" + +from typing import Any + +from typing_extensions import TypeVarTuple, Unpack + +from hathor.serialization import Deserializer, Serializer + +from . import Decoder, Encoder + +Ts = TypeVarTuple('Ts') + + +def encode_tuple(serializer: Serializer, values: tuple[Unpack[Ts]], encoders: tuple[Encoder[Any], ...]) -> None: + assert len(values) == len(encoders) + # mypy can't track tuple element-wise mapping yet — safe due to length check above + for value, encoder in zip(values, encoders): # type: ignore + encoder(serializer, value) + + +def decode_tuple(deserializer: Deserializer, decoders: tuple[Decoder[Any], ...]) -> tuple[Unpack[Ts]]: + return tuple(decoder(deserializer) for decoder in decoders) diff --git a/hathor/serialization/consts.py b/hathor/serialization/consts.py new file mode 100644 index 000000000..13888d7bf --- /dev/null +++ b/hathor/serialization/consts.py @@ -0,0 +1,16 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DEFAULT_LEB128_MAX_BYTES: int = 4 +DEFAULT_BYTES_MAX_LENGTH: int = 2**16 # 64KiB diff --git a/hathor/serialization/deserializer.py b/hathor/serialization/deserializer.py new file mode 100644 index 000000000..0fcecaf42 --- /dev/null +++ b/hathor/serialization/deserializer.py @@ -0,0 +1,109 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import struct +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, TypeVar, overload + +from typing_extensions import Self + +from .types import Buffer + +if TYPE_CHECKING: + from .adapters import MaxBytesDeserializer + from .bytes_deserializer import BytesDeserializer + +T = TypeVar('T') + + +class Deserializer(ABC): + def finalize(self) -> None: + """Check that all bytes were consumed, the deserializer cannot be used after this.""" + raise TypeError('this deserializer does not support finalization') + + @staticmethod + def build_bytes_deserializer(data: Buffer) -> BytesDeserializer: + from .bytes_deserializer import BytesDeserializer + return BytesDeserializer(data) + + @abstractmethod + def is_empty(self) -> bool: + raise NotImplementedError + + @abstractmethod + def peek_byte(self) -> int: + """Read a single byte but don't consume from buffer.""" + raise NotImplementedError + + @abstractmethod + def peek_bytes(self, n: int, *, exact: bool = True) -> Buffer: + """Read n bytes but don't consume from buffer.""" + raise NotImplementedError + + def peek_struct(self, format: str) -> tuple[Any, ...]: + size = struct.calcsize(format) + data = self.peek_bytes(size) + return struct.unpack(format, data) + + @abstractmethod + def read_byte(self) -> int: + """Read a single byte as unsigned int.""" + raise NotImplementedError + + @abstractmethod + def read_bytes(self, n: int, *, exact: bool = True) -> Buffer: + """Read n bytes, when exact=True it errors if there isn't enough data""" + # XXX: this is a blanket implementation that is an example of the behavior, this implementation has to be + # explicitly used if needed + def iter_bytes(): + for _ in range(n): + if not exact and self.is_empty(): + break + yield self.read_byte() + return bytes(iter_bytes()) + + @abstractmethod + def read_all(self) -> Buffer: + """Read all bytes until the reader is empty.""" + # XXX: it is recommended that implementors of 
Deserializer specialize this implementation + def iter_bytes(): + while not self.is_empty(): + yield self.read_byte() + return bytes(iter_bytes()) + + def read_struct(self, format: str) -> tuple[Any, ...]: + size = struct.calcsize(format) + data = self.read_bytes(size) + return struct.unpack_from(format, data) + + def with_max_bytes(self, max_bytes: int) -> MaxBytesDeserializer[Self]: + """Helper method to wrap the current deserializer with MaxBytesDeserializer.""" + from .adapters import MaxBytesDeserializer + return MaxBytesDeserializer(self, max_bytes) + + @overload + def with_optional_max_bytes(self, max_bytes: None) -> Self: + ... + + @overload + def with_optional_max_bytes(self, max_bytes: int) -> MaxBytesDeserializer[Self]: + ... + + def with_optional_max_bytes(self, max_bytes: int | None) -> Self | MaxBytesDeserializer[Self]: + """Helper method to optionally wrap the current deserializer.""" + if max_bytes is None: + return self + return self.with_max_bytes(max_bytes) diff --git a/hathor/serialization/encoding/__init__.py b/hathor/serialization/encoding/__init__.py new file mode 100644 index 000000000..11f3b0954 --- /dev/null +++ b/hathor/serialization/encoding/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module was made to hold simple encoding implementations. + +Simple in this context means "not compound". 
For example a fixed-size int encoding can have sized/signed parameters, +but not have a generic function or type as a parameter. For compound types (optionals, lists, dicts, ...) the encoder +should be in the `compound_encoding` module. + +The general organization should be that each submodule `x` deals with a single type and look like this: + + def encode_x(serializer: Serializer, value: ValueType, ...config params...) -> None: + ... + + def decode_x(deserializer: Deserializer, ...config params...) -> ValueType: + ... + +The "config params" are optional and specific to each encoder. Submodules should not have to take into consideration +how types are mapped to encoders. +""" diff --git a/hathor/serialization/encoding/bool.py b/hathor/serialization/encoding/bool.py new file mode 100644 index 000000000..878cbde03 --- /dev/null +++ b/hathor/serialization/encoding/bool.py @@ -0,0 +1,78 @@ +# Copyright 2025 Hathor Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +This module implements encoding a boolean value using 1 byte. 
def encode_bool(serializer: Serializer, value: bool) -> None:
    """ Encodes a boolean value using 1 byte.

    True is written as byte 0x01 and False as byte 0x00.
    """
    assert isinstance(value, bool)
    serializer.write_byte(int(value))


def decode_bool(deserializer: Deserializer) -> bool:
    """ Decodes a boolean value from 1 byte.

    Only 0x00 (False) and 0x01 (True) are accepted; any other byte raises
    ValueError.
    """
    byte = deserializer.read_byte()
    if byte in (0, 1):
        return bool(byte)
    raise ValueError(f'{bytes([byte])!r} is not a valid boolean')
def encode_bytes(serializer: Serializer, data: bytes) -> None:
    """ Encodes a byte-sequence adding a length prefix.

    The prefix is the sequence length encoded as an unsigned LEB128 integer.
    This module's docstring has more details and examples.
    """
    assert isinstance(data, bytes)
    size = len(data)
    encode_leb128(serializer, size, signed=False)
    serializer.write_bytes(data)


def decode_bytes(deserializer: Deserializer) -> bytes:
    """ Decodes a byte-sequence with a length prefix.

    Inverse of `encode_bytes`. This module's docstring has more details and
    examples.
    """
    expected_size = decode_leb128(deserializer, signed=False)
    raw = deserializer.read_bytes(expected_size)
    return bytes(raw)
def encode_int(serializer: Serializer, number: int, *, length: int, signed: bool) -> None:
    """ Encode an int as exactly `length` big-endian bytes with the given signedness.

    Raises ValueError when `number` does not fit in `length` bytes.

    This module's docstring has more details and examples.
    """
    try:
        encoded = int.to_bytes(number, length, byteorder='big', signed=signed)
    except OverflowError:
        raise ValueError('too big to encode')
    serializer.write_bytes(encoded)


def decode_int(deserializer: Deserializer, *, length: int, signed: bool) -> int:
    """ Decode an int from exactly `length` big-endian bytes with the given signedness.

    This module's docstring has more details and examples.
    """
    raw = deserializer.read_bytes(length)
    return int.from_bytes(raw, byteorder='big', signed=signed)
def encode_leb128(serializer: Serializer, value: int, *, signed: bool) -> None:
    """ Encodes an integer using LEB128 (7 data bits per byte, bit 7 = continuation).

    Caller must explicitly choose `signed=True` or `signed=False`.

    Raises ValueError when a negative value is encoded as unsigned.

    This module's docstring has more details on LEB128 and examples.
    """
    if not signed and value < 0:
        # fix: error message said "unsigend"
        raise ValueError('cannot encode value <0 as unsigned')
    while True:
        byte = value & 0b0111_1111
        value >>= 7
        if signed:
            # stop when the remaining value is pure sign-extension (0 or -1) and
            # bit 6 of the last emitted byte agrees with that sign
            cont = (value == 0 and (byte & 0b0100_0000) == 0) or (value == -1 and (byte & 0b0100_0000) != 0)
        else:
            # `byte` has bit 7 masked off, so the previous `(byte & 0b1000_0000) == 0`
            # guard was always true; stopping only depends on the value being exhausted
            cont = value == 0
        if cont:
            serializer.write_byte(byte)
            break
        serializer.write_byte(byte | 0b1000_0000)


def decode_leb128(deserializer: Deserializer, *, signed: bool) -> int:
    """ Decodes a LEB128-encoded integer.

    Caller must explicitly choose `signed=True` or `signed=False`.

    This module's docstring has more details on LEB128 and examples.
    """
    result = 0
    shift = 0
    while True:
        # NOTE: truncated input is terminated by the deserializer raising on
        # read_byte (presumably OutOfDataError — confirm against Deserializer)
        byte = deserializer.read_byte()
        result |= (byte & 0b0111_1111) << shift
        shift += 7
        if (byte & 0b1000_0000) == 0:
            # final block: for signed values, bit 6 of the last byte carries the sign
            if signed and (byte & 0b0100_0000) != 0:
                return result | -(1 << shift)
            return result
+ +Our custom encoding format consists of the following: + +- if value <= 2**31 - 1, use 4-bytes and encode it as a signed positive integer +- if value > 2**31 - 1, use 8-bytes and encode it as a signed negative integer + +When decoding, we peek at the first byte to determine the sign and whether read 4 or 8 bytes. + +Examples: + +>>> se = Serializer.build_bytes_serializer() +>>> try: +... encode_output_value(se, 0) +... except ValueError as e: +... print(*e.args) +Number must be strictly positive + +>>> try: +... encode_output_value(se, -1) +... except ValueError as e: +... print(*e.args) +Number must not be negative + +>>> se = Serializer.build_bytes_serializer() +>>> encode_output_value(se, 0, strict=False) # writes 00000000 +>>> encode_output_value(se, 100) # writes 00000064 +>>> encode_output_value(se, 2 ** 31 - 1) # writes 7fffffff +>>> encode_output_value(se, 2 ** 31) # writes ffffffff80000000 +>>> encode_output_value(se, 2 ** 63) # writes 8000000000000000 +>>> bytes(se.finalize()).hex() +'00000000000000647fffffffffffffff800000008000000000000000' + +>>> se = Serializer.build_bytes_serializer() +>>> try: +... encode_output_value(se, 2 ** 63 + 1) +... except ValueError as e: +... print(*e.args) +Number is too big; max possible value is 2**63, got: 9223372036854775809 + +>>> de = Deserializer.build_bytes_deserializer(b'\x00\x00\x00\x00') +>>> try: +... decode_output_value(de) +... except ValueError as e: +... 
import struct

# NOTE(review): `Serializer`, `Deserializer` and `BadDataError` come from the
# module-level imports of hathor.serialization / hathor.serialization.exceptions.

MAX_OUTPUT_VALUE_32 = 2 ** 31 - 1  # max value (inclusive) before having to use 8 bytes: 2_147_483_647
MAX_OUTPUT_VALUE_64 = 2 ** 63  # max value (inclusive) that can be encoded (with 8 bytes): 9_223_372_036_854_775_808


def encode_output_value(serializer: Serializer, number: int, *, strict: bool = True) -> None:
    """ Encodes either 4 or 8 bytes using our output-value format.

    Values up to 2**31 - 1 are written as a 4-byte signed non-negative int;
    larger values are written negated as an 8-byte signed (negative) int, so
    the sign bit of the first byte tells the decoder which form was used.

    With `strict=True` (the default) zero is rejected; negative numbers and
    numbers above 2**63 always raise ValueError.

    This module's docstring has more details and examples.
    """
    assert isinstance(number, int)
    if number < 0:
        raise ValueError('Number must not be negative')
    if strict and number == 0:
        raise ValueError('Number must be strictly positive')
    if number > MAX_OUTPUT_VALUE_64:
        raise ValueError(f'Number is too big; max possible value is 2**63, got: {number}')
    # XXX: `signed` makes no difference, but oh well
    if number > MAX_OUTPUT_VALUE_32:
        serializer.write_bytes((-number).to_bytes(8, byteorder='big', signed=True))
    else:
        serializer.write_bytes(number.to_bytes(4, byteorder='big', signed=True))


def decode_output_value(deserializer: Deserializer, *, strict: bool = True) -> int:
    """ Decodes either 4 or 8 bytes using our output-value format.

    A negative first byte selects the 8-byte (negated) form, otherwise the
    4-byte form is read. Non-canonical encodings (values that fit in 4 bytes
    but were written with 8) raise ValueError; with `strict=True` zero is
    rejected as well.

    Raises BadDataError when there are not enough bytes to read.

    This module's docstring has more details and examples.
    """
    value_high_byte, = deserializer.peek_struct('!b')
    try:
        if value_high_byte < 0:
            raw_value, = deserializer.read_struct('!q')
            value = -raw_value
        else:
            value, = deserializer.read_struct('!i')
    except struct.error as e:
        raise BadDataError('Invalid byte struct for output') from e
    assert value >= 0
    if strict and value == 0:
        raise ValueError('Number must be strictly positive')
    # fix: use <= so the boundary value 2**31 - 1 — which the encoder always
    # writes with 4 bytes — is also rejected when encoded with 8 bytes
    if value <= MAX_OUTPUT_VALUE_32 and value_high_byte < 0:
        raise ValueError('Value fits in 4 bytes but is using 8 bytes')
    return value
def encode_utf8(serializer: Serializer, value: str) -> None:
    """ Encodes a string using UTF-8 and adding a length prefix.

    The prefix counts UTF-8 bytes, not characters. This module's docstring has
    more details and examples.
    """
    assert isinstance(value, str)
    encode_bytes(serializer, value.encode('utf-8'))


def decode_utf8(deserializer: Deserializer) -> str:
    """ Decodes a UTF-8 string with a length prefix.

    Inverse of `encode_utf8`. This module's docstring has more details and
    examples.
    """
    return decode_bytes(deserializer).decode('utf-8')
import struct

from hathor.exception import HathorError


class SerializationError(HathorError):
    """Base class for every error raised by the hathor.serialization package."""
    pass


class UnsupportedTypeError(SerializationError):
    # NOTE(review): semantics inferred from the name only — presumably raised
    # when a value's type has no encoding; confirm against call sites.
    pass


class TooLongError(SerializationError):
    # NOTE(review): presumably raised when encoded data exceeds a size limit
    # (e.g. the max-bytes adapters); confirm against call sites.
    pass


class OutOfDataError(SerializationError, struct.error):
    # Also subclasses struct.error, presumably so existing `struct.error`
    # handlers keep catching read-past-end failures.
    pass


class BadDataError(SerializationError):
    """Raised when encoded bytes are malformed (e.g. an invalid output-value struct)."""
    pass
from __future__ import annotations

import struct
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, TypeVar, overload

from typing_extensions import Self

from .types import Buffer

if TYPE_CHECKING:
    from .adapters import MaxBytesSerializer
    from .bytes_serializer import BytesSerializer

# NOTE(review): `T` appears unused in this module — confirm before removing.
T = TypeVar('T')


class Serializer(ABC):
    """ Abstract base class for writing out a byte stream.

    Concrete subclasses implement `cur_pos`, `write_byte` and `write_bytes`;
    the remaining methods are built on top of those primitives.
    """

    def finalize(self) -> Buffer:
        """Get the resulting byte sequence, the serializer cannot be reused after this."""
        raise TypeError('this serializer does not support finalization')

    @abstractmethod
    def cur_pos(self) -> int:
        """Return the current position — presumably bytes written so far; confirm in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def write_byte(self, data: int) -> None:
        """Write a single byte."""
        raise NotImplementedError

    @abstractmethod
    def write_bytes(self, data: Buffer) -> None:
        """Write a byte sequence; this default implementation falls back to per-byte writes."""
        # XXX: it is recommended that implementors of Serializer specialize this implementation
        for byte in bytes(memoryview(data)):
            self.write_byte(byte)

    def write_struct(self, data: tuple[Any, ...], format: str) -> None:
        """Pack `data` with `struct.pack(format)` and write the resulting bytes."""
        data_bytes = struct.pack(format, *data)
        self.write_bytes(data_bytes)

    def with_max_bytes(self, max_bytes: int) -> MaxBytesSerializer[Self]:
        """Helper method to wrap the current serializer with MaxBytesSerializer."""
        # imported locally, presumably to avoid an import cycle with .adapters
        from .adapters import MaxBytesSerializer
        return MaxBytesSerializer(self, max_bytes)

    @overload
    def with_optional_max_bytes(self, max_bytes: None) -> Self:
        ...

    @overload
    def with_optional_max_bytes(self, max_bytes: int) -> MaxBytesSerializer[Self]:
        ...

    def with_optional_max_bytes(self, max_bytes: int | None) -> Self | MaxBytesSerializer[Self]:
        """Helper method to optionally wrap the current serializer."""
        if max_bytes is None:
            return self
        return self.with_max_bytes(max_bytes)

    @staticmethod
    def build_bytes_serializer() -> BytesSerializer:
        """Create a fresh in-memory serializer (imported locally, presumably to avoid a cycle)."""
        from .bytes_serializer import BytesSerializer
        return BytesSerializer()