Review notes (free-form text ahead of the first "diff --git" header; not part of any hunk):
- Line breaks and indentation were reconstructed from a whitespace-collapsed copy of this patch.
  Every hunk was re-counted against its @@ header, but the leading whitespace of context lines
  should be confirmed against the target files before applying.
- filter_equity_bars() returns one row per surviving ticker (ticker, avg_close_price, avg_volume),
  not the input bars. Both comparisons are strict, so a ticker exactly on a threshold is dropped.
- tft_dataset.py: the null-timestamp fallback now interprets the `date` column in UTC instead of
  America/New_York, and dataset_schema is defined at the bottom of the module but only looked up
  when TFTDataset.__init__ runs.
- pyproject.toml: NOTE(review): confirm `src` is a key uv recognizes under [tool.uv].
- uv.lock: revision moves from 3 to 2. NOTE(review): presumably regenerated with an older uv;
  confirm this is intended.

diff --git a/applications/portfoliomanager/pyproject.toml b/applications/portfoliomanager/pyproject.toml
index 997cfba9c..d780991c9 100644
--- a/applications/portfoliomanager/pyproject.toml
+++ b/applications/portfoliomanager/pyproject.toml
@@ -5,5 +5,9 @@ description = "Portfolio prediction and construction service"
 requires-python = "==3.12.10"
 dependencies = ["internal"]
 
+[tool.uv]
+package = true
+src = ["src"]
+
 [tool.uv.sources]
 internal = { workspace = true }
diff --git a/applications/portfoliomanager/src/portfoliomanager/preprocess.py b/applications/portfoliomanager/src/portfoliomanager/preprocess.py
new file mode 100644
index 000000000..527c124d7
--- /dev/null
+++ b/applications/portfoliomanager/src/portfoliomanager/preprocess.py
@@ -0,0 +1,21 @@
+import polars as pl
+
+
+def filter_equity_bars(
+    data: pl.DataFrame,
+    minimum_average_close_price: float = 10.0,
+    minimum_average_volume: float = 1_000_000.0,
+) -> pl.DataFrame:
+    data = data.clone()
+
+    return (
+        data.group_by("ticker")
+        .agg(
+            avg_close_price=pl.col("close_price").mean(),
+            avg_volume=pl.col("volume").mean(),
+        )
+        .filter(
+            (pl.col("avg_close_price") > minimum_average_close_price)
+            & (pl.col("avg_volume") > minimum_average_volume)
+        )
+    )
diff --git a/applications/portfoliomanager/tests/test_preprocess.py b/applications/portfoliomanager/tests/test_preprocess.py
new file mode 100644
index 000000000..f2afdf5c8
--- /dev/null
+++ b/applications/portfoliomanager/tests/test_preprocess.py
@@ -0,0 +1,214 @@
+import polars as pl
+import pytest
+from portfoliomanager.preprocess import filter_equity_bars
+
+
+def test_filter_equity_bars_above_thresholds() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL", "AAPL"],
+            "close_price": [15.0, 20.0, 25.0],
+            "volume": [
+                1_500_000.0,
+                2_000_000.0,
+                2_500_000.0,
+            ],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 1
+    assert result["ticker"][0] == "AAPL"
+    assert result["avg_close_price"][0] == 20.0  # noqa: PLR2004
+    assert result["avg_volume"][0] == 2_000_000.0  # noqa: PLR2004
+
+
+def test_filter_equity_bars_below_price_threshold() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL", "AAPL"],
+            "close_price": [5.0, 8.0, 9.0],
+            "volume": [
+                1_500_000.0,
+                2_000_000.0,
+                2_500_000.0,
+            ],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 0
+
+
+def test_filter_equity_bars_below_volume_threshold() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL", "AAPL"],
+            "close_price": [15.0, 20.0, 25.0],
+            "volume": [500_000.0, 600_000.0, 700_000.0],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 0
+
+
+def test_filter_equity_bars_below_both_thresholds() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL", "AAPL"],
+            "close_price": [5.0, 6.0, 7.0],
+            "volume": [500_000.0, 600_000.0, 700_000.0],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 0
+
+
+def test_filter_equity_bars_at_exact_thresholds() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL", "AAPL"],
+            "close_price": [
+                10.0,
+                10.0,
+                10.0,
+            ],
+            "volume": [
+                1_000_000.0,
+                1_000_000.0,
+                1_000_000.0,
+            ],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 0
+
+
+def test_filter_equity_bars_just_above_thresholds() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL", "AAPL"],
+            "close_price": [10.01, 10.01, 10.01],
+            "volume": [
+                1_000_001.0,
+                1_000_001.0,
+                1_000_001.0,
+            ],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 1
+    assert result["ticker"][0] == "AAPL"
+    assert result["avg_close_price"][0] == pytest.approx(10.01)
+    assert result["avg_volume"][0] == pytest.approx(1_000_001.0)
+
+
+def test_filter_equity_bars_empty_dataframe() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": [],
+            "close_price": [],
+            "volume": [],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 0
+
+
+def test_filter_equity_bars_single_row() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL"],
+            "close_price": [15.0],
+            "volume": [1_500_000.0],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 1
+    assert result["ticker"][0] == "AAPL"
+    assert result["avg_close_price"][0] == 15.0  # noqa: PLR2004
+    assert result["avg_volume"][0] == 1_500_000.0  # noqa: PLR2004
+
+
+def test_filter_equity_bars_mixed_values() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL"],
+            "close_price": [5.0, 25.0],
+            "volume": [
+                500_000.0,
+                1_500_000.0,
+            ],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 0
+
+
+def test_filter_equity_bars_multiple_tickers() -> None:
+    data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL", "AAPL", "GOOGL", "GOOGL", "TSLA", "TSLA"],
+            "close_price": [
+                15.0,
+                20.0,
+                25.0,
+                5.0,
+                6.0,
+                12.0,
+                18.0,
+            ],
+            "volume": [
+                1_500_000.0,
+                2_000_000.0,
+                2_500_000.0,
+                2_000_000.0,
+                3_000_000.0,
+                800_000.0,
+                900_000.0,
+            ],
+        }
+    )
+
+    result = filter_equity_bars(data)
+
+    assert len(result) == 1
+    assert result["ticker"][0] == "AAPL"
+    assert result["avg_close_price"][0] == 20.0  # noqa: PLR2004
+    assert result["avg_volume"][0] == 2_000_000.0  # noqa: PLR2004
+
+
+def test_filter_equity_bars_data_immutability() -> None:
+    original_data = pl.DataFrame(
+        {
+            "ticker": ["AAPL", "AAPL", "AAPL"],
+            "close_price": [15.0, 20.0, 25.0],
+            "volume": [1_500_000.0, 2_000_000.0, 2_500_000.0],
+        }
+    )
+
+    original_tickers = original_data["ticker"].to_list()
+    original_close_prices = original_data["close_price"].to_list()
+    original_volumes = original_data["volume"].to_list()
+
+    filter_equity_bars(original_data)
+
+    assert original_data["ticker"].to_list() == original_tickers
+    assert original_data["close_price"].to_list() == original_close_prices
+    assert original_data["volume"].to_list() == original_volumes
diff --git a/libraries/python/src/internal/tft_dataset.py b/libraries/python/src/internal/tft_dataset.py
index 0ef2f61b4..aa075916f 100644
--- a/libraries/python/src/internal/tft_dataset.py
+++ b/libraries/python/src/internal/tft_dataset.py
@@ -1,11 +1,9 @@
-from typing import TYPE_CHECKING
+from datetime import date
 
+import pandera.polars as pa
 import polars as pl
 from tinygrad.tensor import Tensor
 
-if TYPE_CHECKING:
-    from datetime import date
-
 
 class Scaler:
     def __init__(self) -> None:
@@ -29,6 +27,8 @@ class TFTDataset:
     """Temporal fusion transformer dataset."""
 
     def __init__(self, data: pl.DataFrame) -> None:
+        data = data.clone()
+
         raw_columns = (
             "ticker",
             "timestamp",
@@ -137,29 +137,33 @@ def __init__(self, data: pl.DataFrame) -> None:
                 pl.col("timestamp").fill_null(
                     pl.col("date")
                     .cast(pl.Datetime)
-                    .dt.replace_time_zone("America/New_York")
+                    .dt.replace_time_zone("UTC")
                     .cast(pl.Int64)
                     .floordiv(1000)
                 ),
             ]
         )
 
-        data = data.with_columns(  # compute new columns
-            pl.col("date").dt.weekday().alias("day_of_week"),
-            pl.col("date").dt.day().alias("day_of_month"),
-            pl.col("date").dt.ordinal_day().alias("day_of_year"),
-            pl.col("date").dt.month().alias("month"),
-            pl.col("date").dt.year().alias("year"),
+        # compute new calendar columns
+        data = data.with_columns(
+            pl.col("date").dt.weekday().alias("day_of_week").cast(pl.Int64),
+            pl.col("date").dt.day().alias("day_of_month").cast(pl.Int64),
+            pl.col("date").dt.ordinal_day().alias("day_of_year").cast(pl.Int64),
+            pl.col("date").dt.month().alias("month").cast(pl.Int64),
+            pl.col("date").dt.year().alias("year").cast(pl.Int64),
         )
 
-        data = data.sort(["ticker", "timestamp"]).with_columns(  # add time index column
+        # add time index column
+        data = data.sort(["ticker", "timestamp"]).with_columns(
             pl.col("timestamp")
             .rank("dense")
             .over("ticker")
-            .cast(pl.Int32)
+            .cast(pl.Int64)
             .alias("time_idx")
         )
 
+        data = dataset_schema.validate(data)
+
         self.scaler = Scaler()
         self.scaler.fit(data[self.continuous_columns])
 
@@ -321,3 +325,58 @@ def get_batches(
             batches.append(batch)
 
         return batches
+
+
+dataset_schema = pa.DataFrameSchema(
+    {
+        "ticker": pa.Column(
+            str,
+            checks=pa.Check.str_matches(r"^[A-Z0-9.\-]+$"),
+            coerce=True,
+            required=True,
+        ),
+        "timestamp": pa.Column(
+            int,
+            checks=pa.Check.gt(0),
+            coerce=True,
+            required=True,
+        ),
+        "open_price": pa.Column(
+            float,
+            checks=pa.Check.ge(0),
+            coerce=True,
+            required=True,
+        ),
+        "high_price": pa.Column(
+            float, checks=pa.Check.ge(0), coerce=True, required=True
+        ),
+        "low_price": pa.Column(
+            float, checks=pa.Check.ge(0), coerce=True, required=True
+        ),
+        "close_price": pa.Column(
+            float, checks=pa.Check.ge(0), coerce=True, required=True
+        ),
+        "volume": pa.Column(
+            int,
+            checks=pa.Check.ge(0),
+            coerce=True,
+            required=True,
+        ),
+        "volume_weighted_average_price": pa.Column(
+            float,
+            checks=pa.Check.ge(0),
+            coerce=True,
+            required=True,
+        ),
+        "sector": pa.Column(str, coerce=True, required=True),
+        "industry": pa.Column(str, coerce=True, required=True),
+        "date": pa.Column(date, coerce=True, required=True),
+        "day_of_week": pa.Column(int, coerce=True, required=True),
+        "day_of_month": pa.Column(int, coerce=True, required=True),
+        "day_of_year": pa.Column(int, coerce=True, required=True),
+        "month": pa.Column(int, coerce=True, required=True),
+        "year": pa.Column(int, coerce=True, required=True),
+        "is_holiday": pa.Column(bool, coerce=True, required=True),
+        "time_idx": pa.Column(int, coerce=True, required=True),
+    }
+)
diff --git a/uv.lock b/uv.lock
index bb7945cc3..185b8b62c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = "==3.12.10"
 resolution-markers = [
     "sys_platform == 'linux'",
@@ -1366,7 +1366,7 @@ wheels = [
 [[package]]
 name = "portfoliomanager"
 version = "0.1.0"
-source = { virtual = "applications/portfoliomanager" }
+source = { editable = "applications/portfoliomanager" }
 dependencies = [
     { name = "internal" },
 ]