diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..fe6bd5064 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.venv/ +**/__pycache__/ diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 331293840..35fb66523 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -19,4 +19,4 @@ jobs: uses: coverallsapp/github-action@v2 with: github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: .python_coverage.xml + path-to-lcov: coverage_output/.python_coverage.xml diff --git a/.gitignore b/.gitignore index f18c2ed27..eef7ca0c9 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,6 @@ infrastructure/Pulumi.infrastructure.yaml .envrc .coverage* .coverage/ -coverage.xml \ No newline at end of file +coverage.xmlcoverage.xml + +todos.md diff --git a/.mise.toml b/.mise.toml index 929a79b1f..41f45dcc9 100644 --- a/.mise.toml +++ b/.mise.toml @@ -32,8 +32,11 @@ uvx ty check [tasks."python:test"] description = "Run Python tests" run = """ +mkdir -p coverage_output +docker compose down --volumes --remove-orphans docker compose build tests -docker compose run -T tests +docker compose run --rm --no-TTY tests +docker compose down --volumes --remove-orphans """ [tasks."application:service:run:production"] @@ -56,21 +59,21 @@ uv run uvicorn src.{{arg(name="service_name")}}.main:application --reload description = "Run integration tests" run = """ cd application/{{arg(name="service_name")}} -docker-compose up --build --abort-on-container-exit --remove-orphans +docker compose up --build --abort-on-container-exit --remove-orphans """ [tasks."application:service:test:behavioral"] description = "Run behavioral tests" run = """ cd application/{{arg(name="service_name")}} -docker-compose up --build --abort-on-container-exit +docker compose up --build --abort-on-container-exit """ [tasks."application:service:cleanup:behavioral"] description = "Clean up behavioral tests" run = """ cd application/{{arg(name="service_name")}} 
-docker-compose down -v +docker compose down -v """ [tasks."lint"] diff --git a/Dockerfile.test b/Dockerfile.test index f24a29c57..8c5ae264c 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -1,9 +1,33 @@ FROM python:3.13 + COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv +RUN apt-get update && apt-get install -y \ + build-essential \ + clang \ + libc6-dev \ + linux-libc-dev \ + && rm -rf /var/lib/apt/lists/* + ENV PYTEST_ADDOPTS="--rootdir=/tests" +ENV PYTHON=1 WORKDIR /tests -COPY pyproject.toml uv.lock . -RUN uv sync --all-groups +COPY pyproject.toml uv.lock ./ + +COPY application/datamanager/pyproject.toml ./application/datamanager/ + +COPY application/positionmanager/pyproject.toml ./application/positionmanager/ + +COPY application/predictionengine/pyproject.toml ./application/predictionengine/ + +COPY infrastructure/pyproject.toml ./infrastructure/ + +COPY workflows/pyproject.toml ./workflows/ + +RUN uv sync --all-packages --dev + +COPY application/ ./application/ + +RUN mkdir -p /tests/coverage_output diff --git a/application/datamanager/Dockerfile b/application/datamanager/Dockerfile index c14d3f7f3..3933837aa 100644 --- a/application/datamanager/Dockerfile +++ b/application/datamanager/Dockerfile @@ -1,9 +1,11 @@ -FROM python:3.13 +FROM python:3.12 + COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv WORKDIR /app COPY pyproject.toml ./ + RUN uv sync --no-dev COPY ./src ./src diff --git a/application/datamanager/pyproject.toml b/application/datamanager/pyproject.toml index 9dcfdb28d..a18dd7b95 100644 --- a/application/datamanager/pyproject.toml +++ b/application/datamanager/pyproject.toml @@ -12,7 +12,8 @@ dependencies = [ "loguru>=0.7.3", "google-cloud-storage>=2.16.0", "httpx>=0.28.1", - "datamanager" + "prometheus-fastapi-instrumentator>=7.1.0", + "loguru>=0.7.3", ] [tool.hatch.build.targets.wheel] @@ -21,9 +22,3 @@ packages = ["datamanager"] [build-system] requires = ["hatchling"] build-backend = "hatchling.build" - -[dependency-groups] -dev = [ 
- "behave>=1.2.6", - "requests>=2.31.0", -] diff --git a/application/datamanager/src/datamanager/config.py b/application/datamanager/src/datamanager/config.py index b6280a9bb..c3110c462 100644 --- a/application/datamanager/src/datamanager/config.py +++ b/application/datamanager/src/datamanager/config.py @@ -5,14 +5,14 @@ class Polygon(BaseModel): - api_key: str | None = Field(default=os.getenv("POLYGON_API_KEY")) + api_key: str = Field(default=os.getenv("POLYGON_API_KEY", "")) base_url: str = "https://api.polygon.io" daily_bars: str = "/v2/aggs/grouped/locale/us/market/stocks/" class Bucket(BaseModel): - name: str | None = Field(default=os.getenv("DATA_BUCKET")) - project: str | None = Field(default=os.getenv("GCP_PROJECT")) + name: str = Field(default=os.getenv("DATA_BUCKET", "")) + project: str = Field(default=os.getenv("GCP_PROJECT", "")) @computed_field def daily_bars_path(self) -> str: diff --git a/application/positionmanager/Dockerfile b/application/positionmanager/Dockerfile index 609297e60..9192cccf0 100644 --- a/application/positionmanager/Dockerfile +++ b/application/positionmanager/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.13 +FROM python:3.12 + COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv ENV PYTHONPATH=/app/src @@ -6,6 +7,7 @@ ENV PYTHONPATH=/app/src WORKDIR /app COPY pyproject.toml ./ + RUN uv sync --no-dev COPY ./src ./src diff --git a/application/positionmanager/src/positionmanager/clients.py b/application/positionmanager/src/positionmanager/clients.py index 846add35a..cf4ff79f8 100644 --- a/application/positionmanager/src/positionmanager/clients.py +++ b/application/positionmanager/src/positionmanager/clients.py @@ -23,7 +23,12 @@ def __init__( def get_cash_balance(self) -> Money: account = self.trading_client.get_account() - return Money.from_float(float(account.cash)) + cash_balance = getattr(account, "cash", None) + + if cash_balance is None: + raise ValueError("Cash balance is not available") + + return 
Money.from_float(float(cash_balance)) def place_notional_order( self, @@ -89,7 +94,7 @@ def get_data( data = ( data.sort("date") - .pivot(index="date", columns="ticker", values="close_price") + .pivot(on="ticker", index="date", values="close_price") .with_columns(pl.all().exclude("date").cast(pl.Float64)) ) diff --git a/application/positionmanager/src/positionmanager/main.py b/application/positionmanager/src/positionmanager/main.py index 990656e49..263f60a28 100644 --- a/application/positionmanager/src/positionmanager/main.py +++ b/application/positionmanager/src/positionmanager/main.py @@ -3,11 +3,9 @@ from datetime import datetime, timedelta import polars as pl from typing import Dict, Any - from .models import Money, DateRange, PredictionPayload from .clients import AlpacaClient, DataClient from .portfolio import PortfolioOptimizer - from prometheus_fastapi_instrumentator import Instrumentator @@ -25,12 +23,12 @@ def get_health() -> dict[str, str]: @application.post("/positions") def create_position(payload: PredictionPayload) -> Dict[str, Any]: alpaca_client = AlpacaClient( - api_key=os.getenv("ALPACA_API_KEY"), - api_secret=os.getenv("ALPACA_API_SECRET"), + api_key=os.getenv("ALPACA_API_KEY", ""), + api_secret=os.getenv("ALPACA_API_SECRET", ""), paper=os.getenv("ALPACA_PAPER", "true").lower() == "true", ) - data_client = DataClient(datamanager_base_url=os.getenv("DATAMANAGER_BASE_URL")) + data_client = DataClient(datamanager_base_url=os.getenv("DATAMANAGER_BASE_URL", "")) portfolio_optimizer = PortfolioOptimizer( minimum_portfolio_tickers=int(os.getenv("MINIMUM_PORTFOLIO_TICKERS", "5")), @@ -134,8 +132,8 @@ def create_position(payload: PredictionPayload) -> Dict[str, Any]: @application.delete("/positions") def delete_positions() -> Dict[str, Any]: alpaca_client = AlpacaClient( - api_key=os.getenv("ALPACA_API_KEY"), - api_secret=os.getenv("ALPACA_API_SECRET"), + api_key=os.getenv("ALPACA_API_KEY", ""), + api_secret=os.getenv("ALPACA_API_SECRET", ""), 
from typing import Any, Dict, Generator, List, Optional, Tuple

import polars as pl
from category_encoders import OrdinalEncoder
from tinygrad.tensor import Tensor


# Continuous market-data columns that get interpolated and z-score scaled.
continuous_variable_columns = [
    "open_price",
    "high_price",
    "low_price",
    "close_price",
    "volume",
    "volume_weighted_average_price",
]


class DataSet:
    """Prepare OHLCV bar data for the transformer.

    Fills calendar gaps per ticker, ordinal-encodes tickers, z-scores the
    continuous columns per ticker, and yields fixed-size training batches
    of (tickers, historical_features, targets).
    """

    def __init__(
        self,
        batch_size: int,
        sequence_length: int,
        sample_count: int,
        scalers: Optional[Dict[str, Dict[str, Tensor]]] = None,
    ) -> None:
        # BUG FIX: the previous default of `{}` was a shared mutable default
        # argument — scaler state would leak between DataSet instances.
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.sample_count = sample_count
        self.scalers: Dict[str, Dict[str, Tensor]] = (
            scalers if scalers is not None else {}
        )
        self.preprocessors: Dict[str, Any] = {}

    def __len__(self) -> int:
        # Number of batches, rounding up so a partial final batch counts.
        return (self.sample_count + self.batch_size - 1) // self.batch_size

    def load_data(self, data: pl.DataFrame) -> None:
        """Normalize, gap-fill, encode and scale `data`.

        Stores the result in `self.data` as a single Tensor whose first
        column is the encoded ticker and remaining columns are the scaled
        continuous features. Fits scalers/encoder on first use.
        """
        data = data.with_columns(
            [
                pl.col("timestamp").str.strptime(pl.Datetime).cast(pl.Date),
                pl.col("open_price").cast(pl.Float64),
                pl.col("high_price").cast(pl.Float64),
                pl.col("low_price").cast(pl.Float64),
                pl.col("close_price").cast(pl.Float64),
                pl.col("volume").cast(pl.Float64),
                pl.col("volume_weighted_average_price").cast(pl.Float64),
                pl.col("ticker").cast(pl.Utf8),
            ]
        )

        # Column positions of the *input* frame; batches() relies on these.
        self.preprocessors["indices"] = {
            col: idx for idx, col in enumerate(data.columns)
        }

        data = data.unique(subset=["ticker", "timestamp"])

        tickers = data.select("ticker").unique()
        minimum_timestamp = data.select(pl.col("timestamp").min())[0, 0]
        maximum_timestamp = data.select(pl.col("timestamp").max())[0, 0]

        # Dense daily calendar spanning the observed range.
        full_dates = pl.DataFrame(
            {
                "timestamp": pl.date_range(
                    start=minimum_timestamp,
                    end=maximum_timestamp,
                    interval="1d",
                    closed="both",
                    eager=True,
                )
            }
        )

        full_tickers_and_dates = tickers.join(full_dates, how="cross")

        full_tickers_and_dates = full_tickers_and_dates.with_columns(
            [
                pl.col("timestamp")
                .rank(method="dense")
                .cast(pl.Int32)
                .alias("time_index")
                - 1
            ]
        )

        data = full_tickers_and_dates.join(data, on=["ticker", "timestamp"], how="left")
        data = data.sort(["ticker", "timestamp"])

        # Interpolate interior gaps, then forward/backward fill the edges.
        data = data.group_by("ticker").map_groups(
            lambda df: df.sort("timestamp").with_columns(
                [
                    pl.col(col)
                    .interpolate()
                    .fill_null(strategy="forward")
                    .fill_null(strategy="backward")
                    for col in continuous_variable_columns
                ]
            )
        )

        ticker_encoder = OrdinalEncoder(
            cols=["ticker"],
            handle_unknown="use_encoded_value",
            handle_missing="use_encoded_value",
        )
        self.preprocessors["ticker_encoder"] = ticker_encoder

        ticker_df = data.select("ticker").to_pandas()
        encoded_tickers = ticker_encoder.fit_transform(ticker_df)
        data = data.with_columns(pl.Series("ticker", encoded_tickers["ticker"]))

        # Fit per-ticker scalers only when none were supplied.
        if not self.scalers:
            for ticker_key, group in data.group_by("ticker"):
                ticker = ticker_key[0]
                means = group[continuous_variable_columns].mean()
                standard_deviations = group[continuous_variable_columns].std()

                self.scalers[str(ticker)] = {
                    "means": Tensor(means.to_numpy()),
                    "standard_deviations": Tensor(standard_deviations.to_numpy()),
                }

        groups: List[Tensor] = []
        for ticker_key, group in data.group_by("ticker"):
            ticker = ticker_key[0]
            means = self.scalers[str(ticker)]["means"]
            standard_deviations = self.scalers[str(ticker)]["standard_deviations"]

            ticker_column = Tensor(group.select("ticker").to_numpy())
            group_data = Tensor(group.select(continuous_variable_columns).to_numpy())

            scaled_group = (group_data.sub(means)).div(standard_deviations)
            combined_group = ticker_column.cat(scaled_group, dim=1)
            groups.append(combined_group)

        # BUG FIX: an uninitialized `Tensor.empty(...)` used to be prepended
        # to this concatenation, injecting one group's worth of garbage rows.
        if len(groups) > 1:
            self.data = groups[0].cat(*groups[1:], dim=0)
        else:
            self.data = groups[0]

    def get_preprocessors(self) -> Dict[str, Any]:
        """Return fitted encoders/scalers.

        Raises ValueError if load_data() has not been called yet.
        """
        if not self.preprocessors:
            raise ValueError("Preprocessors have not been initialized.")

        means_by_ticker = {
            ticker: values["means"] for ticker, values in self.scalers.items()
        }
        standard_deviations_by_ticker = {
            ticker: values["standard_deviations"]
            for ticker, values in self.scalers.items()
        }

        return {
            "means_by_ticker": means_by_ticker,
            "standard_deviations_by_ticker": standard_deviations_by_ticker,
            "ticker_encoder": self.preprocessors["ticker_encoder"],
            "indices": self.preprocessors["indices"],
        }

    def batches(self) -> Generator[Tuple[Tensor, Tensor, Tensor], None, None]:
        """Yield (tickers, historical_features, targets) batches.

        Each batch is zero-padded at the tail so every yielded batch has
        exactly `batch_size` rows of `sequence_length` history.
        """
        close_price_idx = self.preprocessors["indices"]["close_price"]

        for start in range(0, self.sample_count, self.batch_size):
            window = self.batch_size + self.sequence_length
            batch_data = self.data[start : start + window]

            if batch_data.shape[0] < window:
                padding = Tensor.zeros(
                    (
                        window - batch_data.shape[0],
                        *batch_data.shape[1:],
                    )
                )
                # BUG FIX: `stack` would introduce a new axis; padding rows
                # must be appended along dim 0 with `cat`.
                batch_data = batch_data.cat(padding, dim=0)

            tickers = batch_data[: self.batch_size, 0]

            historical_features = Tensor.stack(
                *[
                    batch_data[offset : offset + self.sequence_length, 1:]
                    for offset in range(self.batch_size)
                ],
                dim=0,
            )

            targets = batch_data[: self.batch_size, close_price_idx].reshape(
                self.batch_size, 1
            )

            yield tickers, historical_features, targets
from typing import Optional, cast

from tinygrad.tensor import Tensor
from tinygrad.nn import Linear, LayerNorm


class GatedResidualNetwork:
    """Gated Residual Network: dense -> ReLU -> dense, sigmoid-gated,
    with a (projected) residual connection and layer normalization."""

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: Optional[int] = None,
    ) -> None:
        # BUG FIX: output_size was None-checked but annotated as a required
        # plain `int`; `Optional[int] = None` matches the existing handling
        # ("no output size" means "same width as the input") and stays
        # backward compatible for positional callers.
        output_size = output_size if output_size is not None else input_size

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.dense_input = Linear(in_features=input_size, out_features=hidden_size)
        self.dense_output = Linear(in_features=hidden_size, out_features=output_size)
        self.gate = Linear(in_features=hidden_size, out_features=output_size)
        self.layer_normalizer = LayerNorm(normalized_shape=output_size)

        # Only project the residual when input and output widths differ.
        self.residual_projection = None
        if input_size != output_size:
            self.residual_projection = Linear(
                in_features=input_size, out_features=output_size
            )

    def forward(
        self,
        input: Tensor,
    ) -> Tensor:
        """Map (..., input_size) -> (..., output_size)."""
        hidden_state = self.dense_input(input).relu()
        output_state = self.dense_output(hidden_state)
        gate_state = self.gate(hidden_state).sigmoid()

        residual = (
            self.residual_projection(input)
            if self.residual_projection is not None
            else input
        )

        gated_output = cast(Tensor, gate_state * output_state + residual)
        return self.layer_normalizer(gated_output)
from typing import List, Tuple

from tinygrad.tensor import Tensor
from tinygrad.nn import LSTMCell


class LongShortTermMemory:
    """Stacked LSTM built from LSTMCell, unrolled over the sequence axis."""

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        layer_count: int = 1,
        dropout_rate: float = 0.0,
    ) -> None:
        self.hidden_size = hidden_size
        self.layer_count = layer_count
        self.dropout_rate = dropout_rate

        # The first layer consumes the raw features; deeper layers consume
        # the hidden state of the layer below.
        self.layers: List[LSTMCell] = []
        for index in range(layer_count):
            layer_input_size = input_size if index == 0 else hidden_size
            self.layers.append(LSTMCell(layer_input_size, hidden_size))

    def forward(
        self,
        input: Tensor,
    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
        """Run the stack over `input` (batch, sequence, features).

        Returns the top-layer output per step, shaped
        (batch, sequence, hidden), plus the final (hidden, cell) states,
        each shaped (layer_count, batch, hidden).
        """
        batch_size, sequence_length, _ = input.shape

        # Per-layer running states kept as plain lists, avoiding in-place
        # tensor __setitem__ writes on the state tensors.
        hidden_states = [
            Tensor.zeros(batch_size, self.hidden_size) for _ in range(self.layer_count)
        ]
        cell_states = [
            Tensor.zeros(batch_size, self.hidden_size) for _ in range(self.layer_count)
        ]

        outputs: List[Tensor] = []

        for step in range(int(sequence_length)):
            layer_input = input[:, step]

            for index, layer in enumerate(self.layers):
                new_hidden, new_cell = layer(
                    x=layer_input,
                    hc=(
                        hidden_states[index],
                        cell_states[index],
                    ),
                )
                hidden_states[index] = new_hidden
                cell_states[index] = new_cell

                # BUG FIX: the old code called `.train()` on a Tensor (which
                # is not a training toggle) and wrote the dropped-out values
                # back into the recurrent state. Inter-layer dropout should
                # only affect what the next layer sees at this step.
                # NOTE(review): Tensor.dropout is a no-op unless tinygrad
                # training mode is enabled by the caller — confirm.
                if self.dropout_rate > 0.0 and index < self.layer_count - 1:
                    layer_input = new_hidden.dropout(self.dropout_rate)
                else:
                    layer_input = new_hidden

            outputs.append(hidden_states[-1])

        output_tensor = Tensor.stack(*outputs, dim=1)
        final_hidden = Tensor.stack(*hidden_states, dim=0)
        final_cell = Tensor.stack(*cell_states, dim=0)
        return output_tensor, (final_hidden, final_cell)
import os
import traceback
from contextlib import asynccontextmanager
from datetime import date, datetime, timedelta
from typing import AsyncGenerator

import polars as pl
import requests
from fastapi import FastAPI, HTTPException, Request, Response, status
from loguru import logger
from prometheus_fastapi_instrumentator import Instrumentator

from .dataset import DataSet
from .miniature_temporal_fusion_transformer import MiniatureTemporalFusionTransformer
from .models import PredictionResponse


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    # Resolve configuration once at startup; the model is lazily loaded on
    # the first prediction request.
    # NOTE: AsyncGenerator needs both type parameters on Python 3.12 (the
    # service Dockerfile pins python:3.12); the single-arg form is 3.13+.
    app.state.datamanager_base_url = os.getenv("DATAMANAGER_BASE_URL", "")
    app.state.model = None
    yield


application = FastAPI(lifespan=lifespan)
Instrumentator().instrument(application).expose(application)


@application.get("/health")
async def health_check() -> Response:
    """Liveness probe: always 200."""
    return Response(status_code=status.HTTP_200_OK)


def fetch_historical_data(
    datamanager_url: str, start_date: date, end_date: date
) -> pl.DataFrame:
    """Fetch equity bars from the datamanager as an Arrow IPC stream.

    Raises requests.HTTPError on a non-2xx response.
    """
    url = f"{datamanager_url}/equity-bars"
    parameters = {
        "start_date": start_date.isoformat(),
        "end_date": end_date.isoformat(),
    }

    # BUG FIX: this request was previously issued twice back to back,
    # doubling load on the datamanager and discarding the first response.
    response = requests.get(url, params=parameters, timeout=30)
    response.raise_for_status()

    import pyarrow as pa  # local import keeps module import light

    buffer = pa.py_buffer(response.content)
    reader = pa.ipc.RecordBatchStreamReader(buffer)
    table = reader.read_all()

    return pl.DataFrame(pl.from_arrow(table))


def load_or_initialize_model(data: pl.DataFrame) -> MiniatureTemporalFusionTransformer:
    """Build the model (fitting preprocessors on `data`) and load saved
    weights if a checkpoint file is present."""
    dataset = DataSet(
        batch_size=32,
        sequence_length=30,
        sample_count=len(data),
    )
    dataset.load_data(data)
    preprocessors = dataset.get_preprocessors()

    model = MiniatureTemporalFusionTransformer(
        input_size=6,
        hidden_size=128,
        output_size=3,
        layer_count=2,
        ticker_count=len(data["ticker"].unique()),
        embedding_size=32,
        attention_head_count=4,
        means_by_ticker=preprocessors["means_by_ticker"],
        standard_deviations_by_ticker=preprocessors["standard_deviations_by_ticker"],
        ticker_encoder=preprocessors["ticker_encoder"],
        dropout_rate=0.0,
    )

    model_path = "miniature_temporal_fusion_transformer.safetensor"
    if os.path.exists(model_path):
        try:
            model.load(model_path)
            logger.info("Loaded existing model weights")
        except Exception as e:
            # Best effort: a missing/corrupt checkpoint falls back to the
            # freshly initialized weights.
            logger.warning(f"Failed to load model weights: {e}")

    return model


@application.post("/create-predictions", response_model=PredictionResponse)
async def create_predictions(
    request: Request,
) -> PredictionResponse:
    """Predict per-ticker price quantiles from the last 30 days of bars."""
    try:
        end_date = datetime.now().date()
        start_date = end_date - timedelta(days=30)

        logger.info(f"Fetching data from {start_date} to {end_date}")
        data = fetch_historical_data(
            request.app.state.datamanager_base_url, start_date, end_date
        )

        if data.is_empty():
            raise HTTPException(
                status_code=404, detail="No data available for prediction"
            )

        if request.app.state.model is None:
            logger.info("Initializing model")
            request.app.state.model = load_or_initialize_model(data)

        model = request.app.state.model

        unique_tickers = data["ticker"].unique().to_list()
        predictions = {}

        for ticker in unique_tickers:
            ticker_data = data.filter(pl.col("ticker") == ticker)
            # A full sequence window is required to run the model.
            if len(ticker_data) < 30:
                logger.warning(f"Insufficient data for ticker {ticker}")
                continue

            recent_data = ticker_data.tail(30)

            dataset = DataSet(
                batch_size=1,
                sequence_length=30,
                sample_count=1,
            )
            dataset.load_data(recent_data)

            for tickers_batch, features_batch, _ in dataset.batches():
                percentile_25, percentile_50, percentile_75 = model.predict(
                    tickers_batch, features_batch
                )

                predictions[ticker] = {
                    "percentile_25": float(percentile_25[0]),
                    "percentile_50": float(percentile_50[0]),
                    "percentile_75": float(percentile_75[0]),
                }
                break  # one single-sequence batch per ticker

        if not predictions:
            raise HTTPException(
                status_code=404, detail="No predictions could be generated"
            )

        return PredictionResponse(predictions=predictions)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error creating predictions: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(
            status_code=500, detail=f"Internal server error: {str(e)}"
        ) from e
from typing import Dict, List, Tuple

import numpy as np
from category_encoders import OrdinalEncoder
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import Adam
from tinygrad.nn.state import (
    get_parameters,
    get_state_dict,
    load_state_dict,
    safe_load,
    safe_save,
)

from .dataset import DataSet
from .gated_residual_network import GatedResidualNetwork
from .long_short_term_memory import LongShortTermMemory
from .loss_function import quantile_loss
from .multi_head_self_attention import MultiHeadSelfAttention
from .post_processor import PostProcessor
from .ticker_embedding import TickerEmbedding


class MiniatureTemporalFusionTransformer:
    """Small TFT-style model: ticker embedding + stacked LSTM encoder +
    gated residual network + multi-head self-attention, emitting three
    quantile outputs that are rescaled back to price space."""

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        layer_count: int,
        ticker_count: int,
        embedding_size: int,
        attention_head_count: int,
        means_by_ticker: Dict[str, Tensor],
        standard_deviations_by_ticker: Dict[str, Tensor],
        ticker_encoder: OrdinalEncoder,
        dropout_rate: float,  # non-zero indicates training
    ) -> None:
        self.ticker_embedding = TickerEmbedding(
            ticker_count=ticker_count,
            embedding_size=embedding_size,
        )

        # BUG FIX: the LSTM consumes [features ‖ ticker embedding], so its
        # *input* width is input_size + embedding_size; its hidden width
        # must stay hidden_size so the downstream GRN and attention widths
        # (both hidden_size) line up. Previously input/hidden were swapped.
        self.lstm_encoder = LongShortTermMemory(
            input_size=input_size + embedding_size,
            hidden_size=hidden_size,
            layer_count=layer_count,
            dropout_rate=dropout_rate,
        )

        self.feature_processor = GatedResidualNetwork(
            input_size=hidden_size,
            hidden_size=hidden_size,
            output_size=hidden_size,
        )

        self.self_attention = MultiHeadSelfAttention(
            heads_count=attention_head_count,
            embedding_size=hidden_size,
        )

        self.output_layer = GatedResidualNetwork(
            input_size=hidden_size,
            hidden_size=hidden_size,
            output_size=output_size,
        )

        self.post_processor = PostProcessor(
            means_by_ticker=means_by_ticker,
            standard_deviations_by_ticker=standard_deviations_by_ticker,
            ticker_encoder=ticker_encoder,
        )

        self.parameters = get_parameters(self)

    def forward(
        self,
        tickers: Tensor,
        features: Tensor,
    ) -> Tuple[Tensor, Tensor, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """Run the network.

        Returns (raw output, attention weights, rescaled percentiles).
        """
        # BUG FIX: the embedding lookup used to be performed twice in a row.
        # (batch, embedding) -> broadcast across the sequence axis.
        ticker_embeddings = self.ticker_embedding.forward(tickers)
        sequence_length = features.shape[1]
        ticker_embeddings = ticker_embeddings.unsqueeze(1).expand(
            -1, sequence_length, -1
        )

        # BUG FIX: torch-style `Tensor.cat([...], dim=-1)` replaced with the
        # tinygrad instance form; likewise `.size(1)` -> `.shape[1]` above.
        lstm_input = features.cat(ticker_embeddings, dim=-1)

        lstm_output, _ = self.lstm_encoder.forward(lstm_input)

        processed_features = self.feature_processor.forward(lstm_output)

        attention_output, attention_weights = self.self_attention.forward(
            processed_features
        )

        output = self.output_layer.forward(attention_output)

        percentile_25, percentile_50, percentile_75 = (
            self.post_processor.post_process_predictions(
                tickers.numpy(),
                output.numpy(),
            )
        )

        return output, attention_weights, (percentile_25, percentile_50, percentile_75)

    def train(
        self,
        dataset: DataSet,
        epoch_count: int,
        learning_rate: float = 1e-3,
    ) -> List[float]:
        """Optimize with Adam against the pinball loss.

        Returns the average loss per epoch.
        """
        optimizer = Adam(params=self.parameters, lr=learning_rate)

        quantiles = (0.25, 0.50, 0.75)
        losses: List[float] = []

        for _ in range(epoch_count):
            epoch_loss = 0.0

            # BUG FIX: batches() yields (tickers, features, targets) tuples;
            # the old nested loop iterated over a tuple's elements instead.
            # The values are already Tensors — no re-wrapping needed.
            for tickers, historical_features, targets in dataset.batches():
                predictions, _, _ = self.forward(tickers, historical_features)

                loss = quantile_loss(predictions, targets, quantiles)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

            average_epoch_loss = epoch_loss / len(dataset)  # typo fixed
            losses.append(average_epoch_loss)

        return losses

    def validate(
        self,
        dataset: DataSet,
    ) -> float:
        """Average quantile loss over `dataset` without updating weights."""
        total_loss = 0.0
        batch_count = len(dataset)

        for tickers, features, targets in dataset.batches():
            output, _, _ = self.forward(tickers, features)
            loss = quantile_loss(output, targets)
            total_loss += loss.item()

        return total_loss / batch_count

    def save(
        self,
        path_and_file: str = "miniature_temporal_fusion_transformer.safetensor",
    ) -> None:
        """Persist all model tensors to a safetensors file."""
        states = get_state_dict(self)
        safe_save(states, path_and_file)

    def load(
        self,
        path_and_file: str = "miniature_temporal_fusion_transformer.safetensor",
    ) -> None:
        """Restore model tensors from a safetensors file."""
        states = safe_load(path_and_file)
        load_state_dict(self, states)

    def predict(
        self,
        tickers: Tensor,
        input: Tensor,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Rescaled (p25, p50, p75) for each input sequence.

        forward() already post-processes its output, so reuse that result
        instead of running the post-processor a second time.
        """
        _, _, percentiles = self.forward(tickers, input)
        return percentiles
from typing import Tuple, cast

from tinygrad.tensor import Tensor
from tinygrad.nn import Linear
from tinygrad.dtype import dtypes


class MultiHeadSelfAttention:
    """Scaled dot-product self-attention with learned query/key/value and
    output projections; returns the projected output together with the
    attention weight tensor."""

    def __init__(
        self,
        heads_count: int,
        embedding_size: int,
    ) -> None:
        if embedding_size % heads_count != 0:
            raise ValueError("Embedding dimension must be divisible by heads count")

        self.heads_count = heads_count
        self.embedding_size = embedding_size
        self.heads_dimension = embedding_size // heads_count

        self.query_weight = Linear(embedding_size, embedding_size)
        self.key_weight = Linear(embedding_size, embedding_size)
        self.value_weight = Linear(embedding_size, embedding_size)

        self.fully_connected_out = Linear(embedding_size, embedding_size)

        # sqrt(d_k) normalizer for the attention scores, built once.
        self.scale = Tensor(self.heads_dimension**0.5, dtype=dtypes.float32)

    def forward(
        self,
        input: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """input: (batch, sequence, embedding) -> (output, attention weights)."""
        batch_size, sequence_length, _ = input.shape

        def split_heads(projected: Tensor) -> Tensor:
            # (B, S, E) -> (B, H, S, E/H)
            return projected.view(
                (batch_size, sequence_length, self.heads_count, self.heads_dimension),
            ).transpose(1, 2)

        queries = split_heads(self.query_weight(input))
        keys = split_heads(self.key_weight(input))
        values = split_heads(self.value_weight(input))

        attention_scores = queries.matmul(keys.transpose(-2, -1)) / self.scale
        attention_weights: Tensor = cast(Tensor, attention_scores).softmax(axis=-1)

        # Weighted sum over values, then fold the heads back together.
        context = attention_weights.matmul(values)
        context = context.transpose(1, 2).reshape(
            batch_size, sequence_length, self.embedding_size
        )

        output = self.fully_connected_out(context)

        return output, attention_weights
from typing import Any, Dict, Tuple

import numpy as np
import polars as pl
from category_encoders import OrdinalEncoder
from tinygrad.tensor import Tensor


class PostProcessor:
    """Maps normalized model outputs back to price space using the
    per-ticker statistics captured at preprocessing time."""

    def __init__(
        self,
        means_by_ticker: Dict[str, Tensor],
        standard_deviations_by_ticker: Dict[str, Tensor],
        ticker_encoder: OrdinalEncoder,
    ) -> None:
        self.means_by_ticker = means_by_ticker
        self.standard_deviations_by_ticker = standard_deviations_by_ticker
        self.ticker_encoder = ticker_encoder

    def post_process_predictions(
        self,
        encoded_tickers: np.ndarray,
        predictions: np.ndarray,
    ) -> Tuple[
        np.ndarray[Any, np.dtype[np.float64]],
        np.ndarray[Any, np.dtype[np.float64]],
        np.ndarray[Any, np.dtype[np.float64]],
    ]:
        """Invert the ticker encoding, undo the per-ticker z-scoring, and
        reduce the last axis to its 25th/50th/75th percentiles.

        Raises ValueError when a decoded ticker has no stored statistics.
        """
        ticker_frame = pl.DataFrame({"ticker": encoded_tickers}).to_pandas()
        decoded_tickers = self.ticker_encoder.inverse_transform(ticker_frame)["ticker"]

        rescaled = np.empty_like(predictions)

        for row, ticker in enumerate(decoded_tickers):
            known = (
                ticker in self.means_by_ticker
                and ticker in self.standard_deviations_by_ticker
            )
            if not known:
                raise ValueError(f"Statistics not found for ticker: {ticker}")

            mean = self.means_by_ticker[ticker].numpy()
            standard_deviation = self.standard_deviations_by_ticker[ticker].numpy()
            rescaled[row, :] = predictions[row, :] * standard_deviation + mean

        return (
            np.percentile(rescaled, 25, axis=-1),
            np.percentile(rescaled, 50, axis=-1),
            np.percentile(rescaled, 75, axis=-1),
        )
diff --git a/application/predictionengine/src/predictionengine/ticker_embedding.py b/application/predictionengine/src/predictionengine/ticker_embedding.py new file mode 100644 index 000000000..ab57ed073 --- /dev/null +++ b/application/predictionengine/src/predictionengine/ticker_embedding.py @@ -0,0 +1,10 @@ +from tinygrad.nn import Embedding +from tinygrad.tensor import Tensor + + +class TickerEmbedding: + def __init__(self, ticker_count: int, embedding_size: int) -> None: + self.embedding = Embedding(ticker_count, embedding_size) + + def forward(self, tickers: Tensor) -> Tensor: + return self.embedding(tickers) diff --git a/application/predictionengine/tests/test_dataset.py b/application/predictionengine/tests/test_dataset.py new file mode 100644 index 000000000..c43cb19a6 --- /dev/null +++ b/application/predictionengine/tests/test_dataset.py @@ -0,0 +1,123 @@ +import polars as pl +import pytest +from application.predictionengine.src.predictionengine.dataset import DataSet + + +def test_dataset_initialization() -> None: + dataset = DataSet( + batch_size=2, + sequence_length=3, + sample_count=3, + ) + + assert dataset.batch_size == 2 + assert dataset.sequence_length == 3 + assert dataset.sample_count == 3 + assert len(dataset) == 2 + + +def test_dataset_load_data() -> None: + data = pl.DataFrame( + { + "timestamp": [ + "2023-01-01", + "2023-01-02", + "2023-01-03", + "2023-01-01", + "2023-01-02", + "2023-01-03", + ], + "open_price": [100.0, 101.0, 102.0, 50.0, 51.0, 52.0], + "high_price": [110.0, 111.0, 112.0, 60.0, 61.0, 62.0], + "low_price": [90.0, 91.0, 92.0, 40.0, 41.0, 42.0], + "close_price": [105.0, 106.0, 107.0, 55.0, 56.0, 57.0], + "volume": [1000.0, 1100.0, 1200.0, 500.0, 600.0, 700.0], + "volume_weighted_average_price": [105.0, 106.0, 107.0, 55.0, 56.0, 57.0], + "ticker": ["AAPL", "AAPL", "AAPL", "GOOGL", "GOOGL", "GOOGL"], + } + ) + + dataset = DataSet( + batch_size=1, + sequence_length=3, + sample_count=6, + ) + + dataset.load_data(data) + + assert 
hasattr(dataset, "data") + assert hasattr(dataset, "preprocessors") + assert "indices" in dataset.preprocessors + assert "ticker_encoder" in dataset.preprocessors + + +def test_dataset_get_preprocessors() -> None: + data = pl.DataFrame( + { + "timestamp": ["2023-01-01", "2023-01-02"], + "open_price": [100.0, 101.0], + "high_price": [110.0, 111.0], + "low_price": [90.0, 91.0], + "close_price": [105.0, 106.0], + "volume": [1000.0, 1100.0], + "volume_weighted_average_price": [105.0, 106.0], + "ticker": ["AAPL", "AAPL"], + } + ) + + dataset = DataSet( + batch_size=1, + sequence_length=2, + sample_count=2, + ) + + dataset.load_data(data) + preprocessors = dataset.get_preprocessors() + + assert "means_by_ticker" in preprocessors + assert "standard_deviations_by_ticker" in preprocessors + assert "ticker_encoder" in preprocessors + assert "indices" in preprocessors + + +def test_dataset_batches() -> None: + data = pl.DataFrame( + { + "timestamp": ["2023-01-01", "2023-01-02", "2023-01-03"], + "open_price": [100.0, 101.0, 102.0], + "high_price": [110.0, 111.0, 112.0], + "low_price": [90.0, 91.0, 92.0], + "close_price": [105.0, 106.0, 107.0], + "volume": [1000.0, 1100.0, 1200.0], + "volume_weighted_average_price": [105.0, 106.0, 107.0], + "ticker": ["AAPL", "AAPL", "AAPL"], + } + ) + + dataset = DataSet( + batch_size=1, + sequence_length=2, + sample_count=3, + ) + + dataset.load_data(data) + + batch_count = 0 + for tickers, features, targets in dataset.batches(): + batch_count += 1 + assert tickers.shape[0] == 1 # batch_size + assert features.shape == (1, 2, 6) # batch_size, sequence_length, features + assert targets.shape == (1, 1) # batch_size, 1 + + assert batch_count > 0 + + +def test_dataset_preprocessors_validation() -> None: + dataset = DataSet( + batch_size=1, + sequence_length=2, + sample_count=2, + ) + + with pytest.raises(ValueError, match="Preprocessors have not been initialized"): + dataset.get_preprocessors() diff --git 
a/application/predictionengine/tests/test_gated_residual_network.py b/application/predictionengine/tests/test_gated_residual_network.py new file mode 100644 index 000000000..617676302 --- /dev/null +++ b/application/predictionengine/tests/test_gated_residual_network.py @@ -0,0 +1,62 @@ +from tinygrad.tensor import Tensor +import numpy as np +from application.predictionengine.src.predictionengine.gated_residual_network import ( + GatedResidualNetwork, +) + + +def test_gated_residual_network_initialization() -> None: + input_size = 64 + hidden_size = 128 + output_size = 32 + + grn = GatedResidualNetwork( + input_size=input_size, + hidden_size=hidden_size, + output_size=output_size, + ) + + assert grn.dense_input.weight.shape == (hidden_size, input_size) + + assert grn.dense_output.weight.shape == (output_size, hidden_size) + + assert grn.gate.weight.shape == (output_size, hidden_size) + + +def test_gated_residual_network_forward() -> None: + grn = GatedResidualNetwork(input_size=32, hidden_size=64, output_size=32) + + input_tensor = Tensor(np.random.randn(8, 32)) + output = grn.forward(input_tensor) + + assert output.shape == (8, 32) + + +def test_gated_residual_network_different_sizes() -> None: + grn = GatedResidualNetwork(input_size=16, hidden_size=32, output_size=8) + + input_tensor = Tensor(np.random.randn(4, 16)) + output = grn.forward(input_tensor) + + assert output.shape == (4, 8) + + +def test_gated_residual_network_single_sample() -> None: + grn = GatedResidualNetwork(input_size=10, hidden_size=20, output_size=10) + + input_tensor = Tensor(np.random.randn(1, 10)) + output = grn.forward(input_tensor) + + assert output.shape == (1, 10) + + +def test_gated_residual_network_consistency() -> None: + grn = GatedResidualNetwork(input_size=16, hidden_size=32, output_size=16) + + input_tensor = Tensor(np.random.randn(2, 16)) + + output1 = grn.forward(input_tensor) + output2 = grn.forward(input_tensor) + + assert output1.shape == output2.shape + assert 
np.allclose(output1.numpy(), output2.numpy()) diff --git a/application/predictionengine/tests/test_long_short_term_memory.py b/application/predictionengine/tests/test_long_short_term_memory.py new file mode 100644 index 000000000..dd631c1bf --- /dev/null +++ b/application/predictionengine/tests/test_long_short_term_memory.py @@ -0,0 +1,77 @@ +from tinygrad.tensor import Tensor +import numpy as np +from application.predictionengine.src.predictionengine.long_short_term_memory import ( + LongShortTermMemory, +) + + +def test_lstm_initialization() -> None: + lstm = LongShortTermMemory( + input_size=32, hidden_size=64, layer_count=2, dropout_rate=0.1 + ) + + assert lstm.hidden_size == 64 + assert lstm.layer_count == 2 + assert lstm.dropout_rate == 0.1 + + +def test_lstm_forward() -> None: + lstm = LongShortTermMemory( + input_size=16, hidden_size=32, layer_count=1, dropout_rate=0.0 + ) + + input_tensor = Tensor(np.random.randn(4, 10, 16)) + output, hidden_state = lstm.forward(input_tensor) + + assert output.shape == (4, 10, 32) + assert isinstance(hidden_state, tuple) + assert len(hidden_state) == 2 + + +def test_lstm_different_sequence_lengths() -> None: + lstm = LongShortTermMemory( + input_size=8, hidden_size=16, layer_count=1, dropout_rate=0.0 + ) + + for seq_len in [5, 10, 20]: + input_tensor = Tensor(np.random.randn(2, seq_len, 8)) + output, hidden_state = lstm.forward(input_tensor) + + assert output.shape == (2, seq_len, 16) + + +def test_lstm_multiple_layers() -> None: + lstm = LongShortTermMemory( + input_size=10, hidden_size=20, layer_count=3, dropout_rate=0.0 + ) + + input_tensor = Tensor(np.random.randn(2, 5, 10)) + output, hidden_state = lstm.forward(input_tensor) + + assert output.shape == (2, 5, 20) + assert isinstance(hidden_state, tuple) + + +def test_lstm_single_timestep() -> None: + lstm = LongShortTermMemory( + input_size=12, hidden_size=24, layer_count=1, dropout_rate=0.0 + ) + + input_tensor = Tensor(np.random.randn(3, 1, 12)) + output, _ = 
lstm.forward(input_tensor) + + assert output.shape == (3, 1, 24) + + +def test_lstm_consistency() -> None: + lstm = LongShortTermMemory( + input_size=6, hidden_size=12, layer_count=1, dropout_rate=0.0 + ) + + input_tensor = Tensor(np.random.randn(1, 3, 6)) + + first_output, _ = lstm.forward(input_tensor) + second_output, _ = lstm.forward(input_tensor) + + assert first_output.shape == second_output.shape + assert np.allclose(first_output.numpy(), second_output.numpy(), rtol=1e-5) diff --git a/application/predictionengine/tests/test_loss_function.py b/application/predictionengine/tests/test_loss_function.py new file mode 100644 index 000000000..512ebda43 --- /dev/null +++ b/application/predictionengine/tests/test_loss_function.py @@ -0,0 +1,58 @@ +from tinygrad.tensor import Tensor +import numpy as np +from application.predictionengine.src.predictionengine.loss_function import ( + quantile_loss, +) + + +def test_quantile_loss_basic() -> None: + predictions = Tensor([[1.0, 2.0, 3.0]]) + targets = Tensor([[2.0]]) + quantiles = (0.25, 0.5, 0.75) + + loss = quantile_loss(predictions, targets, quantiles) + + assert isinstance(loss, Tensor) + assert loss.shape == () or loss.shape == (1,) + + +def test_quantile_loss_multiple_samples() -> None: + predictions = Tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + targets = Tensor([[2.5], [5.5]]) + quantiles = (0.25, 0.5, 0.75) + + loss = quantile_loss(predictions, targets, quantiles) + + assert isinstance(loss, Tensor) + assert loss.shape == () or loss.shape == (1,) + + +def test_quantile_loss_perfect_prediction() -> None: + predictions = Tensor([[1.0, 2.0, 3.0]]) + targets = Tensor([[2.0]]) # matches median prediction + quantiles = (0.25, 0.5, 0.75) + + loss = quantile_loss(predictions, targets, quantiles) + + assert loss.numpy() >= 0.0 + + +def test_quantile_loss_different_quantiles() -> None: + predictions = Tensor([[1.0, 2.0, 3.0, 4.0, 5.0]]) # 5 quantiles + targets = Tensor([[3.0]]) + quantiles = (0.1, 0.25, 0.5, 0.75, 0.9) + + 
loss = quantile_loss(predictions, targets, quantiles) + + assert isinstance(loss, Tensor) + assert loss.numpy() >= 0.0 + + +def test_quantile_loss_shapes() -> None: + for batch_size in [1, 2, 4, 8]: + predictions = Tensor(np.random.randn(batch_size, 3).astype(np.float32)) + targets = Tensor(np.random.randn(batch_size, 1).astype(np.float32)) + quantiles = (0.25, 0.5, 0.75) + + loss = quantile_loss(predictions, targets, quantiles) + assert isinstance(loss, Tensor) diff --git a/application/predictionengine/tests/test_multi_head_self_attention.py b/application/predictionengine/tests/test_multi_head_self_attention.py new file mode 100644 index 000000000..d818eea00 --- /dev/null +++ b/application/predictionengine/tests/test_multi_head_self_attention.py @@ -0,0 +1,67 @@ +from tinygrad.tensor import Tensor +import numpy as np +from application.predictionengine.src.predictionengine.multi_head_self_attention import ( + MultiHeadSelfAttention, +) + + +def test_multi_head_attention_initialization() -> None: + attention = MultiHeadSelfAttention(heads_count=8, embedding_size=64) + + assert attention.heads_count == 8 + assert attention.embedding_size == 64 + + +def test_multi_head_attention_forward() -> None: + attention = MultiHeadSelfAttention(heads_count=4, embedding_size=32) + + input_tensor = Tensor(np.random.randn(2, 10, 32)) + output, attention_weights = attention.forward(input_tensor) + + assert output.shape == (2, 10, 32) + assert attention_weights.shape[0] == 2 # batch size + assert attention_weights.shape[1] == 4 # heads count + + +def test_multi_head_attention_different_heads() -> None: + for heads_count in [1, 2, 4, 8]: + embedding_size = 32 + attention = MultiHeadSelfAttention( + heads_count=heads_count, embedding_size=embedding_size + ) + + input_tensor = Tensor(np.random.randn(1, 5, embedding_size)) + output, attention_weights = attention.forward(input_tensor) + + assert output.shape == (1, 5, embedding_size) + assert attention_weights.shape[1] == heads_count + + 
+def test_multi_head_attention_single_sequence() -> None: + attention = MultiHeadSelfAttention(heads_count=2, embedding_size=16) + + input_tensor = Tensor(np.random.randn(1, 1, 16)) + output, _ = attention.forward(input_tensor) + + assert output.shape == (1, 1, 16) + + +def test_multi_head_attention_longer_sequences() -> None: + attention = MultiHeadSelfAttention(heads_count=4, embedding_size=64) + + for seq_len in [10, 20, 50]: + input_tensor = Tensor(np.random.randn(1, seq_len, 64)) + output, _ = attention.forward(input_tensor) + + assert output.shape == (1, seq_len, 64) + + +def test_multi_head_attention_batch_processing() -> None: + attention = MultiHeadSelfAttention(heads_count=2, embedding_size=32) + + for batch_size in [1, 2, 4, 8]: + input_tensor = Tensor(np.random.randn(batch_size, 5, 32)) + output, attention_weights = attention.forward(input_tensor) + + assert output.shape == (batch_size, 5, 32) + assert attention_weights.shape[0] == batch_size diff --git a/application/predictionengine/tests/test_post_processor.py b/application/predictionengine/tests/test_post_processor.py new file mode 100644 index 000000000..0caf8c0cb --- /dev/null +++ b/application/predictionengine/tests/test_post_processor.py @@ -0,0 +1,104 @@ +from category_encoders import OrdinalEncoder +import polars as pl +from tinygrad.tensor import Tensor +import numpy as np +from application.predictionengine.src.predictionengine.post_processor import ( + PostProcessor, +) + + +def test_post_processor_initialization() -> None: + ticker_encoder = OrdinalEncoder(cols=["ticker"]) + means_by_ticker = {"AAPL": Tensor([150.0])} + standard_deviations_by_ticker = {"AAPL": Tensor([5.0])} + + post_processor = PostProcessor( + means_by_ticker=means_by_ticker, + standard_deviations_by_ticker=standard_deviations_by_ticker, + ticker_encoder=ticker_encoder, + ) + + assert post_processor.means_by_ticker == means_by_ticker + assert post_processor.standard_deviations_by_ticker == standard_deviations_by_ticker + 
assert post_processor.ticker_encoder == ticker_encoder + + +def test_post_processor_predictions() -> None: + tickers = ["AAPL", "GOOGL"] + + ticker_encoder = OrdinalEncoder(cols=["ticker"]) + ticker_encoder.fit(pl.DataFrame({"ticker": tickers}).to_pandas()) + + means_by_ticker = { + "AAPL": Tensor([150.0]), + "GOOGL": Tensor([2800.0]), + } + + standard_deviations_by_ticker = { + "AAPL": Tensor([5.0]), + "GOOGL": Tensor([50.0]), + } + + encoded_tickers = ticker_encoder.transform( + pl.DataFrame({"ticker": tickers}).to_pandas() + )["ticker"].to_numpy() + + predictions = np.array( + [ + [0.0, 1.0, -1.0], # AAPL: mean, +1std, -1std + [0.0, 1.0, -1.0], # GOOGL: mean, +1std, -1std + ] + ) + + post_processor = PostProcessor( + means_by_ticker=means_by_ticker, + standard_deviations_by_ticker=standard_deviations_by_ticker, + ticker_encoder=ticker_encoder, + ) + + percentile_25, percentile_50, percentile_75 = ( + post_processor.post_process_predictions( + encoded_tickers=encoded_tickers, + predictions=predictions, + ) + ) + + assert isinstance(percentile_25, np.ndarray) + assert isinstance(percentile_50, np.ndarray) + assert isinstance(percentile_75, np.ndarray) + assert len(percentile_25) == 2 + assert len(percentile_50) == 2 + assert len(percentile_75) == 2 + + assert np.all(percentile_25 <= percentile_50) + assert np.all(percentile_50 <= percentile_75) + + +def test_post_processor_single_ticker() -> None: + ticker_encoder = OrdinalEncoder(cols=["ticker"]) + ticker_encoder.fit(pl.DataFrame({"ticker": ["AAPL"]}).to_pandas()) + + means_by_ticker = {"AAPL": Tensor([100.0])} + standard_deviations_by_ticker = {"AAPL": Tensor([10.0])} + + encoded_tickers = ticker_encoder.transform( + pl.DataFrame({"ticker": ["AAPL"]}).to_pandas() + )["ticker"].to_numpy() + predictions = np.array([[0.5, 1.0, 1.5]]) # single prediction + + post_processor = PostProcessor( + means_by_ticker=means_by_ticker, + standard_deviations_by_ticker=standard_deviations_by_ticker, + 
ticker_encoder=ticker_encoder, + ) + + percentile_25, percentile_50, percentile_75 = ( + post_processor.post_process_predictions( + encoded_tickers=encoded_tickers, + predictions=predictions, + ) + ) + + assert len(percentile_25) == 1 + assert len(percentile_50) == 1 + assert len(percentile_75) == 1 diff --git a/application/predictionengine/tests/test_ticker_embedding.py b/application/predictionengine/tests/test_ticker_embedding.py new file mode 100644 index 000000000..abb70d349 --- /dev/null +++ b/application/predictionengine/tests/test_ticker_embedding.py @@ -0,0 +1,55 @@ +from tinygrad.tensor import Tensor +from application.predictionengine.src.predictionengine.ticker_embedding import ( + TickerEmbedding, +) + + +def test_ticker_embedding_initialization() -> None: + embedding = TickerEmbedding(ticker_count=100, embedding_size=32) + + assert hasattr(embedding, "embedding") + + +def test_ticker_embedding_forward() -> None: + embedding = TickerEmbedding(ticker_count=10, embedding_size=16) + + ticker_ids: Tensor = Tensor([1]) + result: Tensor = embedding.forward(ticker_ids) + + assert result.shape == (1, 16) + + ticker_ids = Tensor([1, 2, 3]) + result = embedding.forward(ticker_ids) + + assert result.shape == (3, 16) + + +def test_ticker_embedding_different_sizes() -> None: + for embedding_size in [8, 16, 32, 64]: + embedding = TickerEmbedding(ticker_count=50, embedding_size=embedding_size) + ticker_ids = Tensor([0, 1, 2]) + result = embedding.forward(ticker_ids) + + assert result.shape == (3, embedding_size) + + +def test_ticker_embedding_range() -> None: + embedding = TickerEmbedding(ticker_count=5, embedding_size=8) + + for ticker_id in range(5): + ticker_ids = Tensor([ticker_id]) + result = embedding.forward(ticker_ids) + assert result.shape == (1, 8) + + +def test_ticker_embedding_consistency() -> None: + embedding = TickerEmbedding(ticker_count=10, embedding_size=3) + tickers = Tensor([1, 2, 3]) + + result = embedding.forward(tickers) + + assert result.shape 
== (3, 3) + + for _, ticker_id in enumerate([1, 2, 3]): + individual_result = embedding.forward(Tensor([ticker_id])) + assert individual_result.shape == (1, 3) diff --git a/compose.yaml b/compose.yaml index 19428f14e..e668fe1c9 100644 --- a/compose.yaml +++ b/compose.yaml @@ -1,13 +1,16 @@ --- -name: Application unit tests +name: pocketsizefund-tests services: tests: + image: pocketsizefund-tests:latest build: context: . dockerfile: Dockerfile.test + tags: + - pocketsizefund-tests:latest volumes: - - .:/tests + - ./coverage_output:/tests/coverage_output # mounted for coverage output command: - /bin/sh - -euxc @@ -15,4 +18,4 @@ services: uv run coverage run --parallel-mode -m pytest && \ uv run coverage combine && \ uv run coverage report && \ - uv run coverage xml + uv run coverage xml -o /tests/coverage_output/.python_coverage.xml diff --git a/infrastructure/cloud_run.py b/infrastructure/cloud_run.py index 2ded5b0ae..5cdeec464 100644 --- a/infrastructure/cloud_run.py +++ b/infrastructure/cloud_run.py @@ -56,7 +56,7 @@ cloudrun.ServiceTemplateSpecContainerEnvArgs( name="DUCKDB_SECRET", value=duckdb_secret, - ), + ), ], ) ], diff --git a/pyproject.toml b/pyproject.toml index 602c3a45f..b7a05e1fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,25 +3,20 @@ name = "pocketsizefund" version = "0.1.0" description = "Open source quantitative hedge fund 🍊" requires-python = ">=3.12" -dependencies = [ - "flytekit>=1.15.4", - "httpx>=0.28.1", - "pulumi-docker-build>=0.0.12", - "pulumi-gcp>=8.32.0", - "requests>=2.32.3", -] [tool.uv.workspace] members = [ "infrastructure", "application/datamanager", "application/positionmanager", + "application/predictionengine", "workflows", ] [tool.uv.sources] datamanager = { workspace = true } positionmanager = { workspace = true } +predictionengine = { workspace = true } workflows = { workspace = true } [tool.hatch.build.targets.wheel] @@ -33,26 +28,30 @@ build-backend = "hatchling.build" [dependency-groups] dev = [ - 
"alpaca-py>=0.40.1", "coverage>=7.8.0", - "duckdb>=1.2.2", - "ecos>=2.0.14", - "fastapi>=0.115.12", - "polars>=1.29.0", - "pyarrow>=20.0.0", - "pyportfolioopt>=1.5.6", "pytest>=8.3.5", - "requests>=2.32.3", "behave>=1.2.6", - "prometheus-fastapi-instrumentator>=7.1.0", - "loguru>=0.7.3", ] [tool.pytest.ini_options] testpaths = [ - "/tests/", "application/datamanager/tests", - "application/positionmanager/tests", + "application/positionmanager/tests", + "application/predictionengine/tests", +] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = [ + "--verbose", + "--tb=short", + "--strict-markers", + "--color=yes", + "--rootdir=/tests" +] +filterwarnings = [ + "ignore::DeprecationWarning:websockets.legacy.*", + "ignore::DeprecationWarning", ] [tool.coverage.run] @@ -64,7 +63,7 @@ show_missing = true skip_covered = true [tool.coverage.xml] -output = ".python_coverage.xml" +output = "coverage_output/.python_coverage.xml" [tool.ruff] select = [ diff --git a/todos.md b/todos.md new file mode 100644 index 000000000..8dc0d791e --- /dev/null +++ b/todos.md @@ -0,0 +1,262 @@ +# Consider validation for required configuration fields. + +While defaulting to empty strings provides type consistency, it may mask configuration errors. Consider adding validation to ensure critical fields like API keys and project identifiers are properly configured. + +Also note the inconsistency: credentials_path on line 24 still uses the old pattern without an empty string default. 
+ +```diff + class GCP(BaseModel): + bucket: Bucket = Bucket() +- credentials_path: str = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "") ++ credentials_path: str = Field(default=os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "")) +``` +Consider adding validation to ensure required fields are non-empty: + +```python +from pydantic import BaseModel, Field, field_validator + +class Polygon(BaseModel): + api_key: str = Field(default=os.getenv("POLYGON_API_KEY", "")) + + @field_validator('api_key') + @classmethod + def validate_api_key(cls, v): + if not v: + raise ValueError('POLYGON_API_KEY environment variable is required') + return v +``` + +In application/datamanager/src/datamanager/config.py at line 8 and also lines +14-15 and 24, the current use of default empty strings for critical config +fields like api_key and credentials_path can hide missing configuration errors. +To fix this, add pydantic field validators for these fields that check if the +value is non-empty and raise a ValueError if not. This ensures required +environment variables are properly set and prevents silent misconfiguration. + +# Move index computation after data transformations. + +The column indices are captured too early, before the data undergoes significant transformations (joins, new columns, reordering). This causes the indices to become stale. + +Move the index computation after the data is finalized: + +```diff +- self.preprocessors["indices"] = { +- col: idx for idx, col in enumerate(data.columns) +- } +- + data = data.unique(subset=["ticker", "timestamp"]) + # ... all transformations ... 
+ data = data.sort(["ticker", "timestamp"]) ++ ++ # Compute indices after all transformations ++ self.preprocessors["indices"] = { ++ col: idx for idx, col in enumerate(data.columns) ++ } +``` + +Also applies to: 84-85 + +In application/predictionengine/src/predictionengine/dataset.py at lines 48-50 +and also lines 84-85, the computation of column indices is done too early before +the data transformations like joins, adding new columns, or reordering. This +causes the stored indices to be outdated. To fix this, move the code that sets +self.preprocessors["indices"] to after all data transformations are complete, +ensuring the indices reflect the final state of the data columns. + +# Fix incorrect scalers check and remove redundant type annotation. + +The condition self.scalers is None will never be true since self.scalers is initialized as {} in __init__. Also, re-declaring the instance variable with a type annotation is unnecessary and confusing. + +```diff +- if self.scalers is None or len(self.scalers) == 0: +- self.scalers: Dict[str, Dict[str, Tensor]] = {} ++ if len(self.scalers) == 0: +``` + +In application/predictionengine/src/predictionengine/dataset.py around lines 117 +to 118, remove the check for `self.scalers is None` since `self.scalers` is +always initialized as an empty dictionary in `__init__`. Also, eliminate the +redundant type annotation on `self.scalers` when assigning it to an empty +dictionary to avoid confusion and maintain clarity. + +# Fix inconsistent None check for required parameter. + +The parameter output_size is declared as int (required), but line 12 checks if it's None. This is inconsistent since a required int parameter cannot be None. 
 + +```diff +- output_size: int, ++ output_size: int | None = None, +``` + +Or if output_size should always be provided, remove the None check: +``` +- output_size = output_size if output_size is not None else input_size ++ # output_size is always provided as per function signature +``` + +In application/predictionengine/src/predictionengine/gated_residual_network.py +at line 12, the code checks if the required int parameter output_size is None, +which is inconsistent. To fix this, either remove the None check entirely if +output_size must always be provided, or update the function signature to make +output_size optional by annotating it as `int | None = None` if the None check +is necessary. + +# Simplify the batch iteration logic. +Since the dataset is configured with batch_size=1 and sample_count=1, there will only be one batch. The nested loop structure is unnecessary and confusing. + + +```diff +- for tickers_batch, features_batch, _ in dataset.batches(): +- percentile_25, percentile_50, percentile_75 = model.predict( +- tickers_batch, features_batch +- ) +- +- predictions[ticker] = { +- "percentile_25": float(percentile_25[0]), +- "percentile_50": float(percentile_50[0]), +- "percentile_75": float(percentile_75[0]), +- } +- break ++ # Get the single batch ++ batch_generator = dataset.batches() ++ tickers_batch, features_batch, _ = next(batch_generator) ++ ++ percentile_25, percentile_50, percentile_75 = model.predict( ++ tickers_batch, features_batch ++ ) ++ ++ predictions[ticker] = { ++ "percentile_25": float(percentile_25[0]), ++ "percentile_50": float(percentile_50[0]), ++ "percentile_75": float(percentile_75[0]), ++ } +``` + +In application/predictionengine/src/predictionengine/main.py around lines 136 to +147, the code uses a for loop to iterate over dataset batches, but since +batch_size and sample_count are both 1, there is only one batch. 
Replace the for +loop with a direct call to get the single batch from the dataset, then perform +the prediction and assignment without looping, simplifying the logic and +improving readability. + +# Quantile loss shape handling is correct; add value-based assertions + +The quantile_loss implementation uses broadcasting (error = y_true - y_pred) and .mean() to reduce to a scalar (0-D Tensor) or a single-element Tensor, so the existing shape assertions (() or (1,)) are valid. +To improve test robustness, add explicit assertions on the numeric value of the loss for known inputs. For example, compute the expected loss by hand for a small input and confirm quantile_loss matches it. +Consider covering additional edge cases: +Perfect prediction (loss == 0) +Negative vs positive errors +Single-quantile scenarios + +In application/predictionengine/tests/test_loss_function.py around lines 8 to +17, the test currently only checks the type and shape of the quantile_loss +output but does not verify its numeric correctness. Enhance the test by adding +explicit assertions that compare the computed loss value against expected values +calculated manually for the given inputs. Also, add additional test cases +covering perfect predictions (where loss should be zero), scenarios with +negative and positive errors, and tests with a single quantile to ensure +comprehensive coverage of quantile_loss behavior. + +# Fix shape mismatch between LSTM output and self-attention input + +The LSTM encoder’s forward returns only the last hidden state of shape (batch_size, hidden_size), but MultiHeadSelfAttention.forward unpacks its input as + +batch_size, sequence_length, _ = input.shape +which requires a 3-D tensor (batch_size, seq_len, embedding_size). + +You must update one of the following: + +Have the LSTM return the full sequence of hidden states (e.g. 
collect layer_hidden_state at each time step) so that lstm_output is (batch_size, seq_len, hidden_size) before passing through the GRN and self-attention layers. +Or replace/adjust the self-attention module (or use a different attention mechanism) that accepts a 2-D input of shape (batch_size, hidden_size). +Locations to address: + +application/predictionengine/src/predictionengine/miniature_temporal_fusion_transformer.py +lines 86–90 (the call to self.self_attention.forward(processed_features)) + +In +application/predictionengine/src/predictionengine/miniature_temporal_fusion_transformer.py +around lines 84 to 90, the LSTM encoder's forward method returns only the last +hidden state with shape (batch_size, hidden_size), but the self-attention layer +expects input of shape (batch_size, sequence_length, embedding_size). To fix +this, modify the LSTM encoder to return the full sequence of hidden states so +that lstm_output has shape (batch_size, seq_len, hidden_size), or alternatively +replace or adjust the self-attention layer to accept a 2-D input of shape +(batch_size, hidden_size). Ensure that the processed_features passed to +self.self_attention.forward have the correct 3-D shape if keeping the +self-attention layer. + + +# Fix the incorrect batch iteration logic and typo. + +The nested loop structure is incorrect. dataset.batches() returns individual batches, not a collection to iterate over. Also, there's a typo in the variable name and the loss is being appended in the wrong place. 
+ +```diff +- for batch in dataset.batches(): +- for tickers, historical_features, targets in batch: +- predictions, _, _ = self.forward( +- Tensor(tickers), +- Tensor(historical_features), +- ) +- +- loss = quantile_loss(predictions, Tensor(targets), quantiles) +- +- optimizer.zero_grad() +- loss.backward() +- optimizer.step() +- +- epoch_loss += loss.numpy().item() +- +- avgerage_epoch_loss = epoch_loss / len(dataset) +- losses.append(avgerage_epoch_loss) ++ batch_count = 0 ++ for tickers, historical_features, targets in dataset.batches(): ++ predictions, _, _ = self.forward( ++ Tensor(tickers), ++ Tensor(historical_features), ++ ) ++ ++ loss = quantile_loss(predictions, Tensor(targets), quantiles) ++ ++ optimizer.zero_grad() ++ loss.backward() ++ optimizer.step() ++ ++ epoch_loss += loss.numpy().item() ++ batch_count += 1 ++ ++ average_epoch_loss = epoch_loss / batch_count ++ losses.append(average_epoch_loss) +``` + +# Fix the batch iteration in validation method. + +The batch iteration has the same issue as in the train method. Also, len(dataset) might not return the correct number of batches. + +```diff + def validate( + self, + dataset: DataSet, + ) -> float: + total_loss = 0.0 +- batch_count = len(dataset) ++ batch_count = 0 + +- for batch in dataset.batches(): +- tickers, features, targets = batch ++ for tickers, features, targets in dataset.batches(): + tickers, features, targets = ( + Tensor(tickers), + Tensor(features), + Tensor(targets), + ) ++ batch_count += 1 +``` + +In +application/predictionengine/src/predictionengine/miniature_temporal_fusion_transformer.py +around lines 142 to 150, the batch iteration in the validation method +incorrectly uses len(dataset) for batch count and iterates over +dataset.batches() similarly to the train method, which is problematic. 
Fix this +by using the correct method or property to get the number of batches and ensure +the batch iteration matches the correct approach used in the train method, +properly unpacking and converting batch elements to Tensors. diff --git a/uv.lock b/uv.lock index 9db92bfff..dcc9b8955 100644 --- a/uv.lock +++ b/uv.lock @@ -8,6 +8,7 @@ members = [ "infrastructure", "pocketsizefund", "positionmanager", + "predictionengine", "workflows", ] @@ -263,6 +264,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload-time = "2025-02-20T21:01:16.647Z" }, ] +[[package]] +name = "category-encoders" +version = "2.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pandas" }, + { name = "patsy" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "statsmodels" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/89/32/f5dff088ebae54d5464124298a9262bdd07c55558ba7e9b961be7ecdb0e6/category_encoders-2.8.1.tar.gz", hash = "sha256:57af8a23bde3cf622ee7e17c11547011795e4d337839d40cbd16c36b67291b33", size = 57856, upload-time = "2025-03-15T16:17:23.176Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/fb/908cb215a30b117bb079a767176038599a5447f2506e21aa2e90d0aabfff/category_encoders-2.8.1-py3-none-any.whl", hash = "sha256:ba77bde0a0afe13732b04997635b1ae82569e42c9696bc361c68674197988303", size = 85710, upload-time = "2025-03-15T16:17:21.839Z" }, +] + [[package]] name = "certifi" version = "2025.4.26" @@ -488,16 +506,11 @@ dependencies = [ { name = "httpx" }, { name = "loguru" }, { name = "polars" }, + { name = "prometheus-fastapi-instrumentator" }, { name = "pyarrow" }, { name = "uvicorn" }, ] -[package.dev-dependencies] -dev = [ - { name = "behave" }, - { name = "requests" }, -] - 
[package.metadata] requires-dist = [ { name = "datamanager", editable = "application/datamanager" }, @@ -507,16 +520,11 @@ requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "polars", specifier = ">=1.29.0" }, + { name = "prometheus-fastapi-instrumentator", specifier = ">=7.1.0" }, { name = "pyarrow", specifier = ">=20.0.0" }, { name = "uvicorn", specifier = ">=0.34.2" }, ] -[package.metadata.requires-dev] -dev = [ - { name = "behave", specifier = ">=1.2.6" }, - { name = "requests", specifier = ">=2.31.0" }, -] - [[package]] name = "debugpy" version = "1.8.14" @@ -1441,6 +1449,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0f/4c/f98024021bef4d44dce3613feebd702c7ad8883f777ff8488384c59e9774/parver-0.5-py3-none-any.whl", hash = "sha256:2281b187276c8e8e3c15634f62287b2fb6fe0efe3010f739a6bd1e45fa2bf2b2", size = 15172, upload-time = "2023-10-03T21:06:52.796Z" }, ] +[[package]] +name = "patsy" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/81/74f6a65b848ffd16c18f920620ce999fe45fe27f01ab3911260ce4ed85e4/patsy-1.0.1.tar.gz", hash = "sha256:e786a9391eec818c054e359b737bbce692f051aee4c661f4141cc88fb459c0c4", size = 396010, upload-time = "2024-11-12T14:10:54.642Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/2b/b50d3d08ea0fc419c183a84210571eba005328efa62b6b98bc28e9ead32a/patsy-1.0.1-py2.py3-none-any.whl", hash = "sha256:751fb38f9e97e62312e921a1954b81e1bb2bcda4f5eeabaf94db251ee791509c", size = 232923, upload-time = "2024-11-12T14:10:52.85Z" }, +] + [[package]] name = "pip" version = "25.1.1" @@ -1476,55 +1496,21 @@ wheels = [ name = "pocketsizefund" version = "0.1.0" source = { editable = "." 
} -dependencies = [ - { name = "flytekit" }, - { name = "httpx" }, - { name = "pulumi-docker-build" }, - { name = "pulumi-gcp" }, - { name = "requests" }, -] [package.dev-dependencies] dev = [ - { name = "alpaca-py" }, { name = "behave" }, { name = "coverage" }, - { name = "duckdb" }, - { name = "ecos" }, - { name = "fastapi" }, - { name = "loguru" }, - { name = "polars" }, - { name = "prometheus-fastapi-instrumentator" }, - { name = "pyarrow" }, - { name = "pyportfolioopt" }, { name = "pytest" }, - { name = "requests" }, ] [package.metadata] -requires-dist = [ - { name = "flytekit", specifier = ">=1.15.4" }, - { name = "httpx", specifier = ">=0.28.1" }, - { name = "pulumi-docker-build", specifier = ">=0.0.12" }, - { name = "pulumi-gcp", specifier = ">=8.32.0" }, - { name = "requests", specifier = ">=2.32.3" }, -] [package.metadata.requires-dev] dev = [ - { name = "alpaca-py", specifier = ">=0.40.1" }, { name = "behave", specifier = ">=1.2.6" }, { name = "coverage", specifier = ">=7.8.0" }, - { name = "duckdb", specifier = ">=1.2.2" }, - { name = "ecos", specifier = ">=2.0.14" }, - { name = "fastapi", specifier = ">=0.115.12" }, - { name = "loguru", specifier = ">=0.7.3" }, - { name = "polars", specifier = ">=1.29.0" }, - { name = "prometheus-fastapi-instrumentator", specifier = ">=7.1.0" }, - { name = "pyarrow", specifier = ">=20.0.0" }, - { name = "pyportfolioopt", specifier = ">=1.5.6" }, { name = "pytest", specifier = ">=8.3.5" }, - { name = "requests", specifier = ">=2.32.3" }, ] [[package]] @@ -1570,6 +1556,32 @@ requires-dist = [ { name = "uvicorn", specifier = ">=0.34.2" }, ] +[[package]] +name = "predictionengine" +version = "0.1.0" +source = { editable = "application/predictionengine" } +dependencies = [ + { name = "category-encoders" }, + { name = "fastapi" }, + { name = "polars" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "tinygrad" }, + { name = "uvicorn" }, +] + +[package.metadata] +requires-dist = [ + { name = "category-encoders", 
specifier = ">=2.8.1" }, + { name = "fastapi", specifier = ">=0.115.12" }, + { name = "polars", specifier = ">=1.29.0" }, + { name = "predictionengine", editable = "application/predictionengine" }, + { name = "pydantic", specifier = ">=2.8.2" }, + { name = "requests", specifier = ">=2.31.0" }, + { name = "tinygrad", specifier = ">=0.10.3" }, + { name = "uvicorn", specifier = ">=0.34.2" }, +] + [[package]] name = "prometheus-client" version = "0.22.0" @@ -2041,6 +2053,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/c0/f5cc95ec88694429fcb841a37456be0a27463bc39d43edbd36e3164120ed/s3fs-2025.5.1-py3-none-any.whl", hash = "sha256:7475e7c40a3a112f17144907ffae50782ab6c03487fe0b45a9c3942bb7a5c606", size = 30476, upload-time = "2025-05-24T12:14:10.056Z" }, ] +[[package]] +name = "scikit-learn" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e", size = 7068312, upload-time = "2025-01-10T08:07:55.348Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/59/8eb1872ca87009bdcdb7f3cdc679ad557b992c12f4b61f9250659e592c63/scikit_learn-1.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2ffa1e9e25b3d93990e74a4be2c2fc61ee5af85811562f1288d5d055880c4322", size = 12010001, upload-time = "2025-01-10T08:06:58.613Z" }, + { url = "https://files.pythonhosted.org/packages/9d/05/f2fc4effc5b32e525408524c982c468c29d22f828834f0625c5ef3d601be/scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:dc5cf3d68c5a20ad6d571584c0750ec641cc46aeef1c1507be51300e6003a7e1", size = 11096360, upload-time = "2025-01-10T08:07:01.556Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/e4/4195d52cf4f113573fb8ebc44ed5a81bd511a92c0228889125fac2f4c3d1/scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c06beb2e839ecc641366000ca84f3cf6fa9faa1777e29cf0c04be6e4d096a348", size = 12209004, upload-time = "2025-01-10T08:07:06.931Z" }, + { url = "https://files.pythonhosted.org/packages/94/be/47e16cdd1e7fcf97d95b3cb08bde1abb13e627861af427a3651fcb80b517/scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8ca8cb270fee8f1f76fa9bfd5c3507d60c6438bbee5687f81042e2bb98e5a97", size = 13171776, upload-time = "2025-01-10T08:07:11.715Z" }, + { url = "https://files.pythonhosted.org/packages/34/b0/ca92b90859070a1487827dbc672f998da95ce83edce1270fc23f96f1f61a/scikit_learn-1.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:7a1c43c8ec9fde528d664d947dc4c0789be4077a3647f232869f41d9bf50e0fb", size = 11071865, upload-time = "2025-01-10T08:07:16.088Z" }, + { url = "https://files.pythonhosted.org/packages/12/ae/993b0fb24a356e71e9a894e42b8a9eec528d4c70217353a1cd7a48bc25d4/scikit_learn-1.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a17c1dea1d56dcda2fac315712f3651a1fea86565b64b48fa1bc090249cbf236", size = 11955804, upload-time = "2025-01-10T08:07:20.385Z" }, + { url = "https://files.pythonhosted.org/packages/d6/54/32fa2ee591af44507eac86406fa6bba968d1eb22831494470d0a2e4a1eb1/scikit_learn-1.6.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a7aa5f9908f0f28f4edaa6963c0a6183f1911e63a69aa03782f0d924c830a35", size = 11100530, upload-time = "2025-01-10T08:07:23.675Z" }, + { url = "https://files.pythonhosted.org/packages/3f/58/55856da1adec655bdce77b502e94a267bf40a8c0b89f8622837f89503b5a/scikit_learn-1.6.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0650e730afb87402baa88afbf31c07b84c98272622aaba002559b614600ca691", size = 12433852, upload-time = "2025-01-10T08:07:26.817Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/4f/c83853af13901a574f8f13b645467285a48940f185b690936bb700a50863/scikit_learn-1.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:3f59fe08dc03ea158605170eb52b22a105f238a5d512c4470ddeca71feae8e5f", size = 11337256, upload-time = "2025-01-10T08:07:31.084Z" }, +] + [[package]] name = "scipy" version = "1.15.3" @@ -2167,6 +2202,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/d0/c9543b52c067a390ae6ae632d7fd1b97a35cdc8d69d40c0b7d334b326410/statsd-4.0.1-py2.py3-none-any.whl", hash = "sha256:c2676519927f7afade3723aca9ca8ea986ef5b059556a980a867721ca69df093", size = 13118, upload-time = "2022-11-06T14:17:34.258Z" }, ] +[[package]] +name = "statsmodels" +version = "0.14.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "patsy" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1f/3b/963a015dd8ea17e10c7b0e2f14d7c4daec903baf60a017e756b57953a4bf/statsmodels-0.14.4.tar.gz", hash = "sha256:5d69e0f39060dc72c067f9bb6e8033b6dccdb0bae101d76a7ef0bcc94e898b67", size = 20354802, upload-time = "2024-10-03T16:15:36.273Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/f8/2662e6a101315ad336f75168fa9bac71f913ebcb92a6be84031d84a0f21f/statsmodels-0.14.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b5a24f5d2c22852d807d2b42daf3a61740820b28d8381daaf59dcb7055bf1a79", size = 10186886, upload-time = "2024-10-03T17:10:44.074Z" }, + { url = "https://files.pythonhosted.org/packages/fa/c0/ee6e8ed35fc1ca9c7538c592f4974547bf72274bc98db1ae4a6e87481a83/statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df4f7864606fa843d7e7c0e6af288f034a2160dba14e6ccc09020a3cf67cb092", size = 9880066, upload-time = "2024-10-03T17:10:56.972Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/97/3380ca6d8fd66cfb3d12941e472642f26e781a311c355a4e97aab2ed0216/statsmodels-0.14.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91341cbde9e8bea5fb419a76e09114e221567d03f34ca26e6d67ae2c27d8fe3c", size = 10283521, upload-time = "2024-10-03T17:14:06.216Z" }, + { url = "https://files.pythonhosted.org/packages/fe/2a/55c5b5c5e5124a202ea3fe0bcdbdeceaf91b4ec6164b8434acb9dd97409c/statsmodels-0.14.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1322286a7bfdde2790bf72d29698a1b76c20b8423a55bdcd0d457969d0041f72", size = 10723228, upload-time = "2024-10-03T17:14:19.587Z" }, + { url = "https://files.pythonhosted.org/packages/4f/76/67747e49dc758daae06f33aad8247b718cd7d224f091d2cd552681215bb2/statsmodels-0.14.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e31b95ac603415887c9f0d344cb523889cf779bc52d68e27e2d23c358958fec7", size = 10859503, upload-time = "2024-10-03T17:14:32.798Z" }, + { url = "https://files.pythonhosted.org/packages/1d/eb/cb8b01f5edf8f135eb3d0553d159db113a35b2948d0e51eeb735e7ae09ea/statsmodels-0.14.4-cp313-cp313-win_amd64.whl", hash = "sha256:81030108d27aecc7995cac05aa280cf8c6025f6a6119894eef648997936c2dd0", size = 9817574, upload-time = "2024-10-03T16:14:37.461Z" }, +] + [[package]] name = "tenacity" version = "9.1.2" @@ -2176,6 +2232,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, ] +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = 
"sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + +[[package]] +name = "tinygrad" +version = "0.10.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/26/ed13ccfe488d550726caaaacaf245aa79f40b9153ad6d5c99012a940cc5a/tinygrad-0.10.3.tar.gz", hash = "sha256:e79184ce68a42599e313bacd7283425f4ae2772cc117e0bb5b437465115cb1bf", size = 10352294, upload-time = "2025-05-14T22:46:28.606Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/23/92/6cab3019b093cda0ca863d72d37dc033e0e859ebd3828a89ae56d1145cc1/tinygrad-0.10.3-py3-none-any.whl", hash = "sha256:2b3e19cc427ed9d55d70e785d7e540b4ed2a3460b89a8ef411e4911b13b7e4c1", size = 10561815, upload-time = "2025-05-14T22:46:25.882Z" }, +] + [[package]] name = "typing-extensions" version = "4.13.2" diff --git a/workflows/prediction_model.py b/workflows/prediction_model.py index f03ad5dc3..986821143 100644 --- a/workflows/prediction_model.py +++ b/workflows/prediction_model.py @@ -42,8 +42,8 @@ def store_model(model_bytes: bytes) -> str: @workflow -def training_workflow(start_date: datetime, end_date: datetime) -> str: +def training_workflow(start_date: datetime, end_date: datetime) -> None: data = fetch_data(start_date=start_date, end_date=end_date) model_bytes = train_dummy_model(data=data) artifact_path = store_model(model_bytes=model_bytes) - return artifact_path + return