ttsim-dev · hmgaudecker · Apr 26, 2025 · Apr 16, 2025 · Apr 16, 2025 · Apr 25, 2025
diff --git a/src/_gettsim/interface.py b/src/_gettsim/interface.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from _gettsim.config import RESOURCE_DIR, SUPPORTED_GROUPINGS
+from ttsim import (
+    compute_taxes_and_transfers,
+    create_data_tree_from_df,
+    set_up_policy_environment,
+)
+
+if TYPE_CHECKING:
+    import pandas as pd
+    from dags.tree.typing import NestedTargetDict
+
+    from ttsim.typing import NestedDataDict, NestedInputsPathsToDfColumns
+
+
+def oss(
+    date: str,
+    df: pd.DataFrame,
+    inputs_tree_to_df_columns: NestedInputsPathsToDfColumns,
+    targets_tree: NestedTargetDict,
+) -> NestedDataDict:
+    """Compute taxes and transfers for the data in `df` in a single call.
+
+    The DataFrame is converted with `create_data_tree_from_df`, the policy
+    environment for `date` is set up from `RESOURCE_DIR`, and both are passed to
+    `compute_taxes_and_transfers` (`rounding=True`, `debug=False`, `jit=False`).
+
+    Args:
+        date:
+            The date that determines the policy environment to compute with.
+        df:
+            The DataFrame containing the data.
+        inputs_tree_to_df_columns:
+            A nested dictionary that maps the expected input structure to the data
+            provided by the user. Keys are strings that form the path to an input.
+            Values can be:
+
+            - Strings that reference column names in the DataFrame.
+            - Integer or boolean values (which are broadcast to the length of the
+              DataFrame).
+        targets_tree:
+            The tree of targets to compute (the example assumes it is defined).
+
+    Returns:
+        The value returned by `compute_taxes_and_transfers`.
+
+    Examples:
+        >>> inputs_tree_to_df_columns = {
+        ...     "einkommensteuer": {
+        ...         "gemeinsam_veranlagt": "joint_taxation",
+        ...         "einkünfte": {
+        ...             "aus_nichtselbstständiger_arbeit": {
+        ...                 "bruttolohn_m": "gross_wage_m",
+        ...             },
+        ...         },
+        ...     },
+        ...     "alter": 30,
+        ...     "p_id": "p_id",
+        ... }
+        >>> df = pd.DataFrame({
+        ...     "gross_wage_m": [1000, 2000, 3000],
+        ...     "joint_taxation": [True, True, False],
+        ...     "p_id": [0, 1, 2],
+        ... })
+        >>> oss(
+        ...     date="2024-01-01",
+        ...     inputs_tree_to_df_columns=inputs_tree_to_df_columns,
+        ...     targets_tree=targets_tree,
+        ...     df=df,
+        ... )
+    """
+    return compute_taxes_and_transfers(
+        data_tree=create_data_tree_from_df(
+            inputs_tree_to_df_columns=inputs_tree_to_df_columns,
+            df=df,
+        ),
+        environment=set_up_policy_environment(
+            date=date,
+            resource_dir=RESOURCE_DIR,
+        ),
+        targets_tree=targets_tree,
+        supported_groupings=SUPPORTED_GROUPINGS,
+        rounding=True,
+        debug=False,
+        jit=False,
+    )
diff --git a/src/_gettsim_tests/utils.py b/src/_gettsim_tests/utils.py
@@ -14,8 +14,8 @@
     compute_taxes_and_transfers,
     merge_trees,
     set_up_policy_environment,
+    to_datetime,
 )
-from ttsim.shared import to_datetime
 
 # Set display options to show all columns without truncation
 pd.set_option("display.max_columns", None)

diff --git a/src/ttsim/__init__.py b/src/ttsim/__init__.py
@@ -5,18 +5,16 @@
     FunctionsAndColumnsOverlapWarning,
     compute_taxes_and_transfers,
 )
-from ttsim.loader import (
-    ConflictingTimeDependentObjectsError,
-    get_active_ttsim_objects_tree_from_module,
-    load_objects_tree_for_date,
-)
+from ttsim.loader import ConflictingTimeDependentObjectsError
 from ttsim.piecewise_polynomial import get_piecewise_parameters, piecewise_polynomial
 from ttsim.policy_environment import PolicyEnvironment, set_up_policy_environment
+from ttsim.prepare_data import create_data_tree_from_df
 from ttsim.rounding import RoundingSpec
 from ttsim.shared import (
     insert_path_and_value,
     join,
     merge_trees,
+    to_datetime,
     upsert_path_and_value,
     upsert_tree,
 )
@@ -53,19 +51,18 @@
     "agg_by_p_id_function",
     "combine_policy_functions_and_derived_functions",
     "compute_taxes_and_transfers",
+    "create_data_tree_from_df",
     "create_time_conversion_functions",
-    "get_active_ttsim_objects_tree_from_module",
-    "get_piecewise_parameters",
     "group_creation_function",
     "insert_path_and_value",
     "join",
-    "load_objects_tree_for_date",
     "merge_trees",
     "piecewise_polynomial",
     "plot_dag",
     "policy_function",
     "policy_input",
     "set_up_policy_environment",
+    "to_datetime",
     "upsert_path_and_value",
     "upsert_tree",
 ]
diff --git a/src/ttsim/prepare_data.py b/src/ttsim/prepare_data.py
@@ -0,0 +1,162 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import dags.tree as dt
+import optree
+import pandas as pd
+
+from ttsim.shared import format_errors_and_warnings, format_list_linewise
+
+if TYPE_CHECKING:
+    from ttsim.typing import NestedDataDict, NestedInputsPathsToDfColumns
+
+
+def create_data_tree_from_df(
+    inputs_tree_to_df_columns: NestedInputsPathsToDfColumns,
+    df: pd.DataFrame,
+) -> NestedDataDict:
+    """Build the nested data dictionary expected by TTSIM from a DataFrame.
+
+    Each leaf of `inputs_tree_to_df_columns` becomes a `pd.Series`: a leaf that is
+    a column name of `df` selects that column; any other leaf (including a string
+    that matches no column) is repeated for every row, using the index of `df`.
+
+    Args
+    ----
+    inputs_tree_to_df_columns:
+        A nested dictionary that defines the structure of the output tree. Keys
+        are strings that define the nested structure. Values can be:
+
+        - Strings that reference column names in the DataFrame.
+        - Integer or boolean values (which are broadcast to the length of the
+          DataFrame).
+    df:
+        The pandas DataFrame containing the source data.
+
+    Returns
+    -------
+    A nested dictionary containing the data organized according to the mapping.
+
+    Raises
+    ------
+    ValueError
+        If `df` has boolean or numeric column names.
+    TypeError
+        If `inputs_tree_to_df_columns` has an incorrect format.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({
+    ...     "a": [1, 2, 3],
+    ...     "b": [4, 5, 6],
+    ...     "c": [7, 8, 9],
+    ... })
+    >>> inputs_tree_to_df_columns = {
+    ...     "n1": {
+    ...         "n2": "a",
+    ...         "n3": "b",
+    ...     },
+    ...     "n4": 3,
+    ... }
+    >>> result = create_data_tree_from_df(
+    ...     inputs_tree_to_df_columns=inputs_tree_to_df_columns,
+    ...     df=df,
+    ... )
+    >>> result
+    {
+        "n1": {
+            "n2": pd.Series([1, 2, 3]),
+            "n3": pd.Series([4, 5, 6]),
+        },
+        "n4": pd.Series([3, 3, 3]),
+    }
+    """
+    _fail_if_df_has_bool_or_numeric_column_names(df)
+    _fail_if_mapper_has_incorrect_format(inputs_tree_to_df_columns)
+
+    flat_mapping = dt.flatten_to_qual_names(inputs_tree_to_df_columns)
+    series_by_qual_name = {
+        qual_name: (
+            df[leaf]
+            if leaf in df.columns
+            else pd.Series([leaf] * len(df), index=df.index)
+        )
+        for qual_name, leaf in flat_mapping.items()
+    }
+    return dt.unflatten_from_qual_names(series_by_qual_name)
+
+
+def _fail_if_mapper_has_incorrect_format(
+    inputs_tree_to_df_columns: NestedInputsPathsToDfColumns,
+) -> None:
+    """Raise a `TypeError` if the inputs-to-columns mapping is malformed.
+
+    Checks, in order: it is a dict, paths are all strings, leaves are str/int/bool.
+    """
+    if not isinstance(inputs_tree_to_df_columns, dict):
+        msg = format_errors_and_warnings(
+            """The input tree to column mapping must be a (nested) dictionary. Call
+            `create_input_structure` to create a template."""
+        )
+        raise TypeError(msg)
+
+    offending_paths = [
+        str(path)
+        for path in optree.tree_paths(inputs_tree_to_df_columns, none_is_leaf=True)
+        if any(not isinstance(element, str) for element in path)
+    ]
+    if offending_paths:
+        msg = format_errors_and_warnings(
+            f"""All path elements of `inputs_tree_to_df_columns` must be strings.
+            Found the following paths that contain non-string elements:
+
+            {format_list_linewise(offending_paths)}
+
+            Call `create_input_structure` to create a template.
+            """
+        )
+        raise TypeError(msg)
+
+    formatted_incorrect_types = "\n".join(
+        f"    - {name}: {type(leaf).__name__}"
+        for name, leaf in dt.flatten_to_qual_names(inputs_tree_to_df_columns).items()
+        if not isinstance(leaf, (str, int, bool))
+    )
+    if formatted_incorrect_types:
+        msg = format_errors_and_warnings(
+            f"""Values of the input tree to column mapping must be strings, integers,
+            or booleans.
+            Found the following incorrect types:
+
+            {formatted_incorrect_types}
+            """
+        )
+        raise TypeError(msg)
+
+
+def _fail_if_df_has_bool_or_numeric_column_names(df: pd.DataFrame) -> None:
+    """Raise a `ValueError` if any column name of `df` is a boolean or a number."""
+    common_msg = format_errors_and_warnings(
+        """DataFrame column names cannot be booleans or numbers. This restriction
+        prevents ambiguity between actual column references and values intended for
+        broadcasting.
+        """
+    )
+    bool_column_names = [col for col in df.columns if isinstance(col, bool)]
+    numeric_column_names = [
+        col
+        for col in df.columns
+        if (isinstance(col, (int, float)) and not isinstance(col, bool))  # bool is int
+        or (isinstance(col, str) and col.isnumeric())
+    ]
+    if bool_column_names or numeric_column_names:
+        msg = format_errors_and_warnings(
+            f"""
+            {common_msg}
+
+            Boolean column names: {bool_column_names}.
+            Numeric column names: {numeric_column_names}.
+            """
+        )
+        raise ValueError(msg)
diff --git a/src/ttsim/typing.py b/src/ttsim/typing.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, NewType
+from typing import TYPE_CHECKING, Any, NewType
 
 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -25,6 +25,7 @@
     QualNamePolicyInputDict = Mapping[str, PolicyInput]
 
     # Specialise from dags' NestedInputDict to GETTSIM's types.
+    NestedInputsPathsToDfColumns = Mapping[str, Any | "NestedInputsPathsToDfColumns"]
     NestedDataDict = Mapping[str, pd.Series | "NestedDataDict"]
     QualNameDataDict = Mapping[str, pd.Series]
     NestedArrayDict = Mapping[str, np.ndarray | "NestedArrayDict"]

diff --git a/tests/ttsim/test_combine_functions.py b/tests/ttsim/test_combine_functions.py
@@ -1,16 +1,16 @@
 import pandas as pd
 import pytest
 
-from ttsim.aggregation import AggType
-from ttsim.automatically_added_functions import create_agg_by_group_functions
-from ttsim.combine_functions import _fail_if_targets_not_in_functions
-from ttsim.compute_taxes_and_transfers import compute_taxes_and_transfers
-from ttsim.policy_environment import PolicyEnvironment
-from ttsim.ttsim_objects import (
+from ttsim import (
+    AggType,
+    PolicyEnvironment,
     agg_by_group_function,
+    compute_taxes_and_transfers,
     policy_function,
     policy_input,
 )
+from ttsim.automatically_added_functions import create_agg_by_group_functions
+from ttsim.combine_functions import _fail_if_targets_not_in_functions
 
 
 @pytest.fixture

diff --git a/tests/ttsim/test_compute_taxes_and_transfers.py b/tests/ttsim/test_compute_taxes_and_transfers.py
@@ -8,27 +8,29 @@
 import pytest
 from mettsim.config import RESOURCE_DIR, SUPPORTED_GROUPINGS
 
-from ttsim.aggregation import AggType
-from ttsim.compute_taxes_and_transfers import (
+from ttsim import (
+    AggType,
     FunctionsAndColumnsOverlapWarning,
+    PolicyEnvironment,
+    agg_by_group_function,
+    agg_by_p_id_function,
+    compute_taxes_and_transfers,
+    group_creation_function,
+    merge_trees,
+    policy_function,
+    policy_input,
+    set_up_policy_environment,
+)
+from ttsim.compute_taxes_and_transfers import (
     _fail_if_foreign_keys_are_invalid_in_data,
     _fail_if_group_ids_are_outside_top_level_namespace,
     _fail_if_group_variables_not_constant_within_groups,
     _fail_if_p_id_is_non_unique,
     _get_top_level_namespace,
     _partial_parameters_to_functions,
-    compute_taxes_and_transfers,
 )
 from ttsim.config import numpy_or_jax as np
-from ttsim.policy_environment import PolicyEnvironment, set_up_policy_environment
-from ttsim.shared import assert_valid_ttsim_pytree, merge_trees
-from ttsim.ttsim_objects import (
-    agg_by_group_function,
-    agg_by_p_id_function,
-    group_creation_function,
-    policy_function,
-    policy_input,
-)
+from ttsim.shared import assert_valid_ttsim_pytree
 
 
 @policy_input()