Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions src/_gettsim/interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from _gettsim.config import RESOURCE_DIR, SUPPORTED_GROUPINGS
from ttsim import (
compute_taxes_and_transfers,
create_data_tree_from_df,
set_up_policy_environment,
)

if TYPE_CHECKING:
import pandas as pd
from dags.tree.typing import NestedTargetDict

Check warning on line 14 in src/_gettsim/interface.py

View check run for this annotation

Codecov / codecov/patch

src/_gettsim/interface.py#L13-L14

Added lines #L13 - L14 were not covered by tests

from ttsim.typing import NestedDataDict, NestedInputsPathsToDfColumns

Check warning on line 16 in src/_gettsim/interface.py

View check run for this annotation

Codecov / codecov/patch

src/_gettsim/interface.py#L16

Added line #L16 was not covered by tests


def oss(
    date: str,
    df: pd.DataFrame,
    inputs_tree_to_df_columns: NestedInputsPathsToDfColumns,
    targets_tree: NestedTargetDict,
) -> NestedDataDict:
    """Compute taxes and transfers in a single call ("one-stop-shop").

    The function chains three steps: it converts ``df`` into a data tree with
    ``create_data_tree_from_df``, sets up the policy environment for ``date``
    from ``RESOURCE_DIR`` with ``set_up_policy_environment``, and hands both to
    ``compute_taxes_and_transfers`` together with ``targets_tree``.

    Args:
        date:
            The date to compute taxes and transfers for. The date determines the
            policy environment for which the taxes and transfers are computed.
        df:
            The DataFrame containing the data.
        inputs_tree_to_df_columns:
            A nested dictionary that maps GETTSIM's expected input structure to
            the data provided by the user. Keys are strings that provide a path
            to an input.

            Values can be:
            - Strings that reference column names in the DataFrame.
            - Integer or boolean values (which are broadcast to match the
              length of the DataFrame).
        targets_tree:
            The targets tree.

    Returns:
        Whatever ``compute_taxes_and_transfers`` returns for the given data,
        policy environment and targets. The computation is always run with
        ``rounding=True``, ``debug=False`` and ``jit=False``, using
        ``SUPPORTED_GROUPINGS`` as the supported groupings.

    Examples:
        ``targets_tree`` is assumed to have been defined by the caller.

        >>> inputs_tree_to_df_columns = {
        ...     "einkommensteuer": {
        ...         "gemeinsam_veranlagt": "joint_taxation",
        ...         "einkünfte": {
        ...             "aus_nichtselbstständiger_arbeit": {
        ...                 "bruttolohn_m": "gross_wage_m",
        ...             },
        ...         },
        ...     },
        ...     "alter": 30,
        ...     "p_id": "p_id",
        ... }
        >>> df = pd.DataFrame(
        ...     {
        ...         "gross_wage_m": [1000, 2000, 3000],
        ...         "joint_taxation": [True, True, False],
        ...         "p_id": [0, 1, 2],
        ...     }
        ... )
        >>> oss(
        ...     date="2024-01-01",
        ...     inputs_tree_to_df_columns=inputs_tree_to_df_columns,
        ...     targets_tree=targets_tree,
        ...     df=df,
        ... )
    """
    # Convert the user's flat DataFrame first, so that problems with the input
    # data surface before the policy environment is set up.
    input_data = create_data_tree_from_df(
        inputs_tree_to_df_columns=inputs_tree_to_df_columns,
        df=df,
    )
    return compute_taxes_and_transfers(
        data_tree=input_data,
        environment=set_up_policy_environment(
            date=date,
            resource_dir=RESOURCE_DIR,
        ),
        targets_tree=targets_tree,
        supported_groupings=SUPPORTED_GROUPINGS,
        rounding=True,
        debug=False,
        jit=False,
    )
2 changes: 1 addition & 1 deletion src/_gettsim_tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
compute_taxes_and_transfers,
merge_trees,
set_up_policy_environment,
to_datetime,
)
from ttsim.shared import to_datetime

# Set display options to show all columns without truncation
pd.set_option("display.max_columns", None)
Expand Down
13 changes: 5 additions & 8 deletions src/ttsim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
FunctionsAndColumnsOverlapWarning,
compute_taxes_and_transfers,
)
from ttsim.loader import (
ConflictingTimeDependentObjectsError,
get_active_ttsim_objects_tree_from_module,
load_objects_tree_for_date,
)
from ttsim.loader import ConflictingTimeDependentObjectsError
from ttsim.piecewise_polynomial import get_piecewise_parameters, piecewise_polynomial
from ttsim.policy_environment import PolicyEnvironment, set_up_policy_environment
from ttsim.prepare_data import create_data_tree_from_df
from ttsim.rounding import RoundingSpec
from ttsim.shared import (
insert_path_and_value,
join,
merge_trees,
to_datetime,
upsert_path_and_value,
upsert_tree,
)
Expand Down Expand Up @@ -53,19 +51,18 @@
"agg_by_p_id_function",
"combine_policy_functions_and_derived_functions",
"compute_taxes_and_transfers",
"create_data_tree_from_df",
"create_time_conversion_functions",
"get_active_ttsim_objects_tree_from_module",
"get_piecewise_parameters",
"group_creation_function",
"insert_path_and_value",
"join",
"load_objects_tree_for_date",
"merge_trees",
"piecewise_polynomial",
"plot_dag",
"policy_function",
"policy_input",
"set_up_policy_environment",
"to_datetime",
"upsert_path_and_value",
"upsert_tree",
]
162 changes: 162 additions & 0 deletions src/ttsim/prepare_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
from __future__ import annotations

Check warning on line 1 in src/ttsim/prepare_data.py

View check run for this annotation

Codecov / codecov/patch

src/ttsim/prepare_data.py#L1

Added line #L1 was not covered by tests

from typing import TYPE_CHECKING

Check warning on line 3 in src/ttsim/prepare_data.py

View check run for this annotation

Codecov / codecov/patch

src/ttsim/prepare_data.py#L3

Added line #L3 was not covered by tests

import dags.tree as dt
import optree
import pandas as pd

Check warning on line 7 in src/ttsim/prepare_data.py

View check run for this annotation

Codecov / codecov/patch

src/ttsim/prepare_data.py#L5-L7

Added lines #L5 - L7 were not covered by tests

from ttsim.shared import format_errors_and_warnings, format_list_linewise

Check warning on line 9 in src/ttsim/prepare_data.py

View check run for this annotation

Codecov / codecov/patch

src/ttsim/prepare_data.py#L9

Added line #L9 was not covered by tests

if TYPE_CHECKING:
from ttsim.typing import NestedDataDict, NestedInputsPathsToDfColumns

Check warning on line 12 in src/ttsim/prepare_data.py

View check run for this annotation

Codecov / codecov/patch

src/ttsim/prepare_data.py#L11-L12

Added lines #L11 - L12 were not covered by tests


def create_data_tree_from_df(
    inputs_tree_to_df_columns: NestedInputsPathsToDfColumns,
    df: pd.DataFrame,
) -> NestedDataDict:
    """Transform a pandas DataFrame to a nested dictionary expected by TTSIM.

    Args
    ----
    inputs_tree_to_df_columns:
        A nested dictionary that defines the structure of the output tree. Keys
        are strings that define the nested structure. Values can be:

        - Strings that reference column names in the DataFrame.
        - Integer or boolean values (which will be broadcast to match the
          DataFrame length).

        Note that a string which does not match any column name of ``df`` is
        not rejected; it is broadcast as a constant, like any other value.
    df:
        The pandas DataFrame containing the source data.

    Returns
    -------
    A nested dictionary structure containing the data organized according to the
    mapping definition. Every leaf is a ``pd.Series``; broadcast constants share
    the index of ``df``.

    Raises
    ------
    ValueError
        If ``df`` has boolean or numeric column names.
    TypeError
        If ``inputs_tree_to_df_columns`` is not a (nested) dictionary with
        string keys, or has leaves that are not strings, integers, or booleans.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     "a": [1, 2, 3],
    ...     "b": [4, 5, 6],
    ...     "c": [7, 8, 9],
    ... })
    >>> inputs_tree_to_df_columns = {
    ...     "n1": {
    ...         "n2": "a",
    ...         "n3": "b",
    ...     },
    ...     "n4": 3,
    ... }
    >>> result = create_data_tree_from_df(
    ...     inputs_tree_to_df_columns=inputs_tree_to_df_columns,
    ...     df=df,
    ... )

    ``result`` then has the following structure (shown schematically)::

        {
            "n1": {
                "n2": pd.Series([1, 2, 3]),
                "n3": pd.Series([4, 5, 6]),
            },
            "n4": pd.Series([3, 3, 3]),
        }

    """
    _fail_if_df_has_bool_or_numeric_column_names(df)
    _fail_if_mapper_has_incorrect_format(inputs_tree_to_df_columns)

    qualified_inputs_tree_to_df_columns = dt.flatten_to_qual_names(
        inputs_tree_to_df_columns
    )

    name_to_input_series = {}
    for (
        qualified_input_name,
        input_value,
    ) in qualified_inputs_tree_to_df_columns.items():
        if input_value in df.columns:
            name_to_input_series[qualified_input_name] = df[input_value]
        else:
            # Let pandas broadcast the scalar along the index. This avoids
            # building a throwaway Python list of length `len(df)` and keeps the
            # dtype of the constant even when `df` has no rows.
            name_to_input_series[qualified_input_name] = pd.Series(
                input_value,
                index=df.index,
            )

    return dt.unflatten_from_qual_names(name_to_input_series)


def _fail_if_mapper_has_incorrect_format(
    inputs_tree_to_df_columns: NestedInputsPathsToDfColumns,
) -> None:
    """Validate the mapping from input tree paths to DataFrame columns.

    Three checks are run in order, and the first one that fails raises:

    1. The mapping itself must be a ``dict``.
    2. Every element of every path in the mapping must be a string.
    3. Every leaf value must be a string, an integer, or a boolean.

    Args:
        inputs_tree_to_df_columns:
            The (nested) mapping to validate.

    Raises:
        TypeError: If any of the checks above fails.
    """
    # NOTE: continuation lines of the message literals are flush-left on
    # purpose, so that the message text is left exactly as it was.
    if not isinstance(inputs_tree_to_df_columns, dict):
        raise TypeError(
            format_errors_and_warnings(
                """The input tree to column mapping must be a (nested) dictionary. Call
`create_input_structure` to create a template."""
            )
        )

    # Collect every path that has at least one non-string element.
    non_string_paths = []
    for path in optree.tree_paths(inputs_tree_to_df_columns, none_is_leaf=True):
        if any(not isinstance(part, str) for part in path):
            non_string_paths.append(str(path))
    if non_string_paths:
        raise TypeError(
            format_errors_and_warnings(
                f"""All path elements of `inputs_tree_to_df_columns` must be strings.
Found the following paths that contain non-string elements:

{format_list_linewise(non_string_paths)}

Call `create_input_structure` to create a template.
"""
            )
        )

    # Map the qualified name of each offending leaf to its type.
    incorrect_types = {}
    flat_mapping = dt.flatten_to_qual_names(inputs_tree_to_df_columns)
    for qual_name, leaf in flat_mapping.items():
        if isinstance(leaf, str | int | bool):
            continue
        incorrect_types[qual_name] = type(leaf)
    if incorrect_types:
        formatted_incorrect_types = "\n".join(
            f" - {k}: {v.__name__}" for k, v in incorrect_types.items()
        )
        raise TypeError(
            format_errors_and_warnings(
                f"""Values of the input tree to column mapping must be strings, integers,
or booleans.
Found the following incorrect types:

{formatted_incorrect_types}
"""
            )
        )


def _fail_if_df_has_bool_or_numeric_column_names(df: pd.DataFrame) -> None:

Check warning on line 138 in src/ttsim/prepare_data.py

View check run for this annotation

Codecov / codecov/patch

src/ttsim/prepare_data.py#L138

Added line #L138 was not covered by tests
"""Fail if the DataFrame has bool or numeric column names."""
common_msg = format_errors_and_warnings(
"""DataFrame column names cannot be booleans or numbers. This restriction
prevents ambiguity between actual column references and values intended for
broadcasting.
"""
)
bool_column_names = [col for col in df.columns if isinstance(col, bool)]
numeric_column_names = [
col
for col in df.columns
if isinstance(col, (int, float)) or (isinstance(col, str) and col.isnumeric())
]

if bool_column_names or numeric_column_names:
msg = format_errors_and_warnings(
f"""
{common_msg}

Boolean column names: {bool_column_names}.
Numeric column names: {numeric_column_names}.
"""
)
raise ValueError(msg)
3 changes: 2 additions & 1 deletion src/ttsim/typing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, NewType
from typing import TYPE_CHECKING, Any, NewType

if TYPE_CHECKING:
from collections.abc import Mapping
Expand All @@ -25,6 +25,7 @@
QualNamePolicyInputDict = Mapping[str, PolicyInput]

# Specialise from dags' NestedInputDict to GETTSIM's types.
NestedInputsPathsToDfColumns = Mapping[str, Any | "NestedInputsPathsToDfColumns"]
NestedDataDict = Mapping[str, pd.Series | "NestedDataDict"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a heads-up that the current type NestedDataDict is nonsense (we never use series) and will change via #879.

QualNameDataDict = Mapping[str, pd.Series]
NestedArrayDict = Mapping[str, np.ndarray | "NestedArrayDict"]
Expand Down
12 changes: 6 additions & 6 deletions tests/ttsim/test_combine_functions.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import pandas as pd
import pytest

from ttsim.aggregation import AggType
from ttsim.automatically_added_functions import create_agg_by_group_functions
from ttsim.combine_functions import _fail_if_targets_not_in_functions
from ttsim.compute_taxes_and_transfers import compute_taxes_and_transfers
from ttsim.policy_environment import PolicyEnvironment
from ttsim.ttsim_objects import (
from ttsim import (
AggType,
PolicyEnvironment,
agg_by_group_function,
compute_taxes_and_transfers,
policy_function,
policy_input,
)
from ttsim.automatically_added_functions import create_agg_by_group_functions
from ttsim.combine_functions import _fail_if_targets_not_in_functions


@pytest.fixture
Expand Down
26 changes: 14 additions & 12 deletions tests/ttsim/test_compute_taxes_and_transfers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,29 @@
import pytest
from mettsim.config import RESOURCE_DIR, SUPPORTED_GROUPINGS

from ttsim.aggregation import AggType
from ttsim.compute_taxes_and_transfers import (
from ttsim import (
AggType,
FunctionsAndColumnsOverlapWarning,
PolicyEnvironment,
agg_by_group_function,
agg_by_p_id_function,
compute_taxes_and_transfers,
group_creation_function,
merge_trees,
policy_function,
policy_input,
set_up_policy_environment,
)
from ttsim.compute_taxes_and_transfers import (
_fail_if_foreign_keys_are_invalid_in_data,
_fail_if_group_ids_are_outside_top_level_namespace,
_fail_if_group_variables_not_constant_within_groups,
_fail_if_p_id_is_non_unique,
_get_top_level_namespace,
_partial_parameters_to_functions,
compute_taxes_and_transfers,
)
from ttsim.config import numpy_or_jax as np
from ttsim.policy_environment import PolicyEnvironment, set_up_policy_environment
from ttsim.shared import assert_valid_ttsim_pytree, merge_trees
from ttsim.ttsim_objects import (
agg_by_group_function,
agg_by_p_id_function,
group_creation_function,
policy_function,
policy_input,
)
from ttsim.shared import assert_valid_ttsim_pytree


@policy_input()
Expand Down
Loading
Loading