
Commit c807bef

aandres committed
Add replay
1 parent 75d21ce commit c807bef

File tree

6 files changed, +543 -4 lines changed


README.md

+1-1
@@ -13,7 +13,7 @@
 # Beavers
 _____

-**Beavers** is a python library for stream processing, optimize for analytics.
+**Beavers** is a python library for stream processing, optimized for analytics.


 It is used at [Tradewell Technologies](https://www.tradewelltech.co/),

beavers/replay.py

+332
@@ -0,0 +1,332 @@
"""Module for replaying historical data."""
import abc
import dataclasses
import logging
from typing import Callable, Generic, Iterator, Optional, Protocol, TypeVar

import pandas as pd

from beavers.engine import UTC_MAX, Dag, Node

logger = logging.getLogger(__name__)

T = TypeVar("T")


@dataclasses.dataclass(frozen=True)
class ReplayContext:
    """
    Stores the information about a replay.

    Attributes
    ----------
    start: pd.Timestamp
        Start of the replay
    end: pd.Timestamp
        End of the replay.
        This is exclusive; the replay will stop 1ns before.
    frequency: pd.Timedelta
        How often the replay should run

    """

    start: pd.Timestamp
    end: pd.Timestamp
    frequency: pd.Timedelta


class DataSource(abc.ABC, Generic[T]):
    """Interface for replaying historical data from a file or database."""

    @abc.abstractmethod
    def read_to(self, timestamp: pd.Timestamp) -> T:
        """
        Read from the data source, all the way to the provided timestamp (inclusive).

        This function is stateful and must remember the previous timestamp
        for which data was read.

        Parameters
        ----------
        timestamp
            End of the time interval for which data is required (inclusive)

        Returns
        -------
        data
            The data for the interval (or empty if no data is found)

        """

    @abc.abstractmethod
    def get_next(self) -> pd.Timestamp:
        """
        Return the next timestamp for which there is data.

        If no data is available this should return `UTC_MAX`.

        Returns
        -------
        timestamp: pd.Timestamp
            Timestamp of the next available data point (or `UTC_MAX` if no more data
            is available)

        """


class DataSink(abc.ABC, Generic[T]):
    """Interface for saving the results of a replay to a file or database."""

    @abc.abstractmethod
    def append(self, timestamp: pd.Timestamp, data: T):
        """
        Append data for the current cycle.

        Parameters
        ----------
        timestamp:
            End of the time interval for which data was replayed (inclusive)
        data:
            The generated data

        """

    @abc.abstractmethod
    def close(self):
        """Flush the data and clean up resources."""


class DataSourceProvider(Protocol[T]):
    """Interface for the provision of `DataSource`."""

    @abc.abstractmethod
    def __call__(self, context: ReplayContext) -> DataSource[T]:
        """
        Create a `DataSource` for the given context.

        Parameters
        ----------
        context:
            Information about the replay that's about to run

        Returns
        -------
        DataSource[T]:
            Source for the replay

        """


class DataSinkProvider(Protocol[T]):
    """Interface for the provision of `DataSink`."""

    @abc.abstractmethod
    def __call__(self, context: ReplayContext) -> DataSink[T]:
        """
        Create a `DataSink` for the given context.

        Parameters
        ----------
        context:
            Information about the replay that's about to run

        Returns
        -------
        DataSink[T]:
            Sink for the replay

        """


@dataclasses.dataclass(frozen=True)
class _ReplaySource(Generic[T]):
    """Internal class used to store `DataSource` at runtime."""

    name: str
    node: Node[T]
    data_source: DataSource[T]


@dataclasses.dataclass(frozen=True)
class _ReplaySink(Generic[T]):
    """Internal class used to store `DataSink` at runtime."""

    name: str
    nodes: list[Node[T]]
    data_sink: DataSink[T]


@dataclasses.dataclass
class ReplayDriver:
    """
    Orchestrate the replay of data for a dag.

    This will:

    - create the relevant `DataSource`s
    - create the relevant `DataSink`s
    - stream the data from the sources
    - inject the input data into the dag source nodes
    - execute the dag
    - collect the output data and pass it to the sinks
    - close the sinks at the end of the run

    Notes
    -----
    Do not call the constructor directly; use `create` instead.

    """

    dag: Dag
    context: ReplayContext
    sources: list[_ReplaySource]
    sinks: list[_ReplaySink]
    current_time: pd.Timestamp

    @staticmethod
    def create(
        dag: Dag,
        context: ReplayContext,
        data_source_providers: dict[str, DataSourceProvider],
        data_sink_providers: dict[str, DataSinkProvider],
    ) -> "ReplayDriver":
        return ReplayDriver(
            dag,
            context,
            _create_sources(dag, context, data_source_providers),
            _create_sinks(dag, context, data_sink_providers),
            current_time=context.start,
        )

    def run(self):
        while not self.is_done():
            self.run_cycle()
        for sink in self.sinks:
            sink.data_sink.close()

    def is_done(self) -> bool:
        return self.current_time > self.context.end

    def run_cycle(self):
        st = pd.Timestamp.now()
        records, next_timestamp = self.read_sources()
        if records or self.dag.get_next_timer() <= self.current_time:
            self.update_dag()
            self.flush_sinks()
            et = pd.Timestamp.now()
            warp_ratio = self.context.frequency / (et - st)
            logger.info(
                f"Running cycle={self.dag.get_cycle_id()} time={self.current_time} "
                f"records={records} warp={warp_ratio:.1f}"
            )

        self.current_time = max(
            next_timestamp, self.current_time + self.context.frequency
        ).ceil(self.context.frequency)

    def read_sources(self) -> tuple[int, pd.Timestamp]:
        records = 0
        next_timestamp = self.context.end
        for replay_source in self.sources:
            source_data = replay_source.data_source.read_to(self.current_time)
            next_timestamp = min(next_timestamp, replay_source.data_source.get_next())
            if len(source_data) > 0:
                replay_source.node.set_stream(source_data)
                records += len(source_data)
        return records, next_timestamp

    def update_dag(self):
        self.dag.stabilize(self.current_time)

    def flush_sinks(self):
        for sink in self.sinks:
            for node in sink.nodes:
                if node.get_cycle_id() == self.dag.get_cycle_id():
                    sink.data_sink.append(self.current_time, node.get_sink_value())


def _create_sources(
    dag: Dag,
    context: ReplayContext,
    data_source_providers: dict[str, DataSourceProvider],
) -> list[_ReplaySource]:
    source_nodes = dag.get_sources()
    nodes_names = sorted(source_nodes.keys())
    source_names = sorted(data_source_providers.keys())
    if nodes_names != source_names:
        raise ValueError(
            "Source node and DataSource names don't match: "
            f"{nodes_names} vs {source_names}"
        )
    return [
        _ReplaySource(name, source_nodes[name], data_source_providers[name](context))
        for name in data_source_providers.keys()
    ]


def _create_sinks(
    dag: Dag, context: ReplayContext, data_sink_providers: dict[str, DataSinkProvider]
) -> list[_ReplaySink]:
    sink_nodes = dag.get_sinks()
    nodes_names = sorted(sink_nodes.keys())
    sink_names = sorted(data_sink_providers.keys())
    if nodes_names != sink_names:
        raise ValueError(
            "Sink node and DataSink names don't match: "
            f"{nodes_names} vs {sink_names}"
        )
    return [
        _ReplaySink(name, sink_nodes[name], data_sink_providers[name](context))
        for name in data_sink_providers.keys()
    ]


class IteratorDataSourceAdapter(DataSource[T]):
    """
    Adapter between an iterator of `DataSource` and a `DataSource`.

    This can be used to stitch together several `DataSource`s covering
    consecutive date ranges.
    """

    def __init__(
        self,
        sources: Iterator[DataSource[T]],
        empty: T,
        concatenator: Callable[[T, T], T],
    ):
        self._sources = sources
        self._empty = empty
        self._concatenator = concatenator
        self._current = self._next()

    def read_to(self, timestamp: pd.Timestamp) -> T:
        if self._current is None:
            return self._empty
        else:
            this_batch = self._current.read_to(timestamp)
            while self._current is not None and self._current.get_next() == UTC_MAX:
                self._current = self._next()
                next_batch = (
                    self._empty
                    if self._current is None
                    else self._current.read_to(timestamp)
                )
                if next_batch and this_batch:
                    this_batch = self._concatenator(this_batch, next_batch)
                elif next_batch:
                    this_batch = next_batch

            return this_batch

    def get_next(self) -> pd.Timestamp:
        if self._current is None:
            return UTC_MAX
        else:
            return self._current.get_next()

    def _next(self) -> Optional[DataSource]:
        try:
            return next(self._sources)
        except StopIteration:
            return None
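
To make the new module's moving parts concrete, here is a minimal usage sketch (not part of this commit). `ListDataSource`, `ListDataSink`, the sample `records`, and the pre-built `dag` with a source node named `"quotes"` and a sink node named `"quotes_out"` are illustrative assumptions; only `ReplayContext`, `DataSource`, `DataSink`, `ReplayDriver`, and `UTC_MAX` come from the code above.

```python
import pandas as pd

from beavers.engine import UTC_MAX
from beavers.replay import DataSink, DataSource, ReplayContext, ReplayDriver


class ListDataSource(DataSource[list[dict]]):
    """Toy DataSource serving time-stamped records from an in-memory list."""

    def __init__(self, records: list[dict]):
        # Records must be sorted by their tz-aware "timestamp" key.
        self._records = records
        self._position = 0

    def read_to(self, timestamp: pd.Timestamp) -> list[dict]:
        # Return all unread records up to `timestamp` (inclusive).
        start = self._position
        while (
            self._position < len(self._records)
            and self._records[self._position]["timestamp"] <= timestamp
        ):
            self._position += 1
        return self._records[start : self._position]

    def get_next(self) -> pd.Timestamp:
        # Next unread timestamp, or UTC_MAX once the list is exhausted.
        if self._position >= len(self._records):
            return UTC_MAX
        return self._records[self._position]["timestamp"]


class ListDataSink(DataSink[list[dict]]):
    """Toy DataSink collecting each cycle's output in memory."""

    def __init__(self):
        self.output: list[tuple[pd.Timestamp, list[dict]]] = []

    def append(self, timestamp: pd.Timestamp, data: list[dict]):
        self.output.append((timestamp, data))

    def close(self):
        pass  # nothing to flush for an in-memory sink


records = [
    {"timestamp": pd.Timestamp("2023-01-02 09:30", tz="UTC"), "price": 100.0},
    {"timestamp": pd.Timestamp("2023-01-02 09:31", tz="UTC"), "price": 101.5},
]

# `dag` (a beavers Dag with a source node named "quotes" and a sink node
# named "quotes_out") is assumed to exist already.
context = ReplayContext(
    start=pd.Timestamp("2023-01-02", tz="UTC"),
    end=pd.Timestamp("2023-01-03", tz="UTC"),
    frequency=pd.Timedelta(minutes=1),
)
sink = ListDataSink()
driver = ReplayDriver.create(
    dag,
    context,
    data_source_providers={"quotes": lambda ctx: ListDataSource(records)},
    data_sink_providers={"quotes_out": lambda ctx: sink},
)
driver.run()  # replays the day in one-minute cycles and fills sink.output
```

Each cycle, the driver reads every source up to `current_time`, stabilizes the dag only when there is new data or a due timer, then appends the value of any updated sink node to the matching `DataSink`.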

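And a sketch of how `IteratorDataSourceAdapter` could stitch one source per day into a single `DataSource`. `ListDataSource` is the toy class from the sketch above and `load_day` is a hypothetical loader, not part of this commit.

```python
from typing import Iterator

import pandas as pd

from beavers.replay import DataSource, IteratorDataSourceAdapter, ReplayContext


def daily_sources(context: ReplayContext) -> Iterator[DataSource[list[dict]]]:
    # Yield one ListDataSource per calendar day of the replay; load_day is a
    # hypothetical function returning that day's records sorted by "timestamp".
    for day in pd.date_range(context.start, context.end, freq="1D"):
        yield ListDataSource(load_day(day))


def quotes_source_provider(context: ReplayContext) -> DataSource[list[dict]]:
    # Usable as a DataSourceProvider: the adapter drains each daily source in
    # turn, concatenating batches whenever a read spans a day boundary.
    return IteratorDataSourceAdapter(
        sources=daily_sources(context),
        empty=[],
        concatenator=lambda a, b: a + b,
    )
```
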
docs/api/replay.md

+7
@@ -0,0 +1,7 @@
::: beavers.replay
    options:
        heading_level: 2
        show_source: true

docs/index.md

+1-1
@@ -1,3 +1,3 @@
 # Beavers

-[**Beavers**](https://github.com/tradewelltech/beavers) is a python library for stream processing, optimize for analytics.
+[**Beavers**](https://github.com/tradewelltech/beavers) is a python library for stream processing, optimized for analytics.

0 commit comments
