diff --git a/docs/cycle/Linear and Cyclical Workflows using Functions and States.ipynb b/docs/cycle/Linear and Cyclical Workflows using Functions and States.ipynb new file mode 100644 index 00000000..d9473ab7 --- /dev/null +++ b/docs/cycle/Linear and Cyclical Workflows using Functions and States.ipynb @@ -0,0 +1,1101 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Linear And Cyclical Workflows Using Functions And States" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the functions in `autora.state`, we can build flexible pipelines and cycles which operate on state objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "from dataclasses import field, dataclass\n", + "\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "import pandas as pd\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "\n", + "from autora.state.delta import State, Delta, wrap_to_use_state\n", + "from autora.state.wrapper import theorist_from_estimator, experiment_runner_from_x_to_y_function\n", + "from autora.variable import VariableCollection, Variable\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiment Runner And Theorist" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a two part AER pipeline consisting of an experiment runner and a theorist (we use the seed conditions\n", + "always).\n", + "\n", + "The key part here is that both experiment runner and theorist are functions which:\n", + "- operate on the `State`, and\n", + "- return a modified object of the **same type** `State`.\n", + "\n", + "### Defining The State\n", + "\n", + "We define the state 
as a dataclass, subclassed from `autora.state.delta.State` with fields representing the variables,\n", + "parameters, experimental data, (possibly) conditions, and (possibly) a model.\n", + "\n", + "This state has no \"history\"; it represents a snapshot of the data at one time. Other exemplar state objects are\n", + "available in the subpackage `autora.state` and include some with in-built histories." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass(frozen=True)\n", + "class Snapshot(State):\n", + " variables: VariableCollection = field(metadata={\"delta\": \"replace\"})\n", + " params: dict = field(metadata={\"delta\": \"replace\"})\n", + " experiment_data: pd.DataFrame = field(metadata={\"delta\": \"extend\"})\n", + " conditions: pd.Series = field(default=None, metadata={\"delta\": \"replace\"})\n", + " model: Optional[BaseEstimator] = field(default=None, metadata={\"delta\": \"replace\"})\n", + "\n", + "s = Snapshot(\n", + " variables=VariableCollection(independent_variables=[Variable(\"x\", value_range=(-15,15))],\n", + " dependent_variables=[Variable(\"y\")]),\n", + " params={},\n", + " conditions=pd.DataFrame({\"x\": np.linspace(-15,15,101)}),\n", + " experiment_data = pd.DataFrame(columns=[\"x\",\"y\"]),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Snapshot(variables=VariableCollection(independent_variables=[Variable(name='x', value_range=(-15, 15), allowed_values=None, units='', type=, variable_label='', rescale=1, is_covariate=False)], dependent_variables=[Variable(name='y', value_range=None, allowed_values=None, units='', type=, variable_label='', rescale=1, is_covariate=False)], covariates=[]), params={}, experiment_data=Empty DataFrame\n", + "Columns: [x, y]\n", + "Index: [], conditions= x\n", + "0 -15.0\n", + "1 -14.7\n", + "2 -14.4\n", + "3 -14.1\n", + "4 -13.8\n", + ".. 
...\n", + "96 13.8\n", + "97 14.1\n", + "98 14.4\n", + "99 14.7\n", + "100 15.0\n", + "\n", + "[101 rows x 1 columns], model=None)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Defining The Experiment Runner\n", + "\n", + "For this example, we'll use a polynomial of degree 3 as our \"ground truth\" function. We're also using pandas\n", + "DataFrames and Series as our data interchange format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coefs = [432, -144, -3, 1] # from https://www.maa.org/sites/default/files/0025570x28304.di021116.02p0130a.pdf\n", + "\n", + "def ground_truth(x: pd.Series) -> pd.Series:\n", + " y = pd.Series(coefs[0] + coefs[1] * x + coefs[2] * x**2 + coefs[3] * x**3, name=\"y\")\n", + " return y\n", + "\n", + "def noisy_observation(x: pd.Series, std=1000, rng=None) -> pd.Series:\n", + " if rng is None:\n", + " rng = np.random.default_rng()\n", + " y = ground_truth(x) + rng.normal(0, std, len(x))\n", + " return y\n", + "\n", + "def noisy_observation_df(df: pd.DataFrame, std=1000, rng=None) -> pd.DataFrame:\n", + " y = pd.DataFrame({\"y\": noisy_observation(df[\"x\"], std=std, rng=rng)}) \n", + " return y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given this state, we define a two part AER pipeline consisting of an experiment runner and a theorist. We'll just\n", + "reuse the initial seed `conditions` in this example.\n", + "\n", + "First we define and test the experiment runner.\n", + "\n", + "The key part here is that both the experiment runner and the theorist are functions which operate on the `State`. 
Therefore, we use a wrapper function `experiment_runner_from_x_to_y_function` that wraps the previously defined `noisy_observation_df` function and returns a function with the same functionality, but operating on the `State`. In this case, we want to use the `State` field `conditions` as input and extend the `State` field `experiment_data`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "experiment_runner = experiment_runner_from_x_to_y_function(noisy_observation_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When we run the experiment runner, we can see the updated state object which is returned – it has new experimental data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xy
0-15.0-1457.368847
1-14.7-1276.238863
2-14.4-1101.204891
3-14.1-938.114081
4-13.8-780.385208
.........
9613.8501.515831
9714.1608.036899
9814.4722.340558
9914.7844.702516
10015.0972.762139
\n", + "

101 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " x y\n", + "0 -15.0 -1457.368847\n", + "1 -14.7 -1276.238863\n", + "2 -14.4 -1101.204891\n", + "3 -14.1 -938.114081\n", + "4 -13.8 -780.385208\n", + ".. ... ...\n", + "96 13.8 501.515831\n", + "97 14.1 608.036899\n", + "98 14.4 722.340558\n", + "99 14.7 844.702516\n", + "100 15.0 972.762139\n", + "\n", + "[101 rows x 2 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment_runner(s, std=1).experiment_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Defining The Theorist\n", + "\n", + "Now we define a theorist, which does a linear regression on the polynomial of degree 5. We define a regressor and a\n", + "method to return its feature names and coefficients, and then the theorist to handle it. Here, we use a different wrapper `theorist_from_estimator` that wraps the regressor and returns a function with the same functionality, but operating on `State` fields. In this case, we want to use the `State` field `experiment_data` and extend the `State` field `models`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Completely standard scikit-learn pipeline regressor\n", + "regressor = make_pipeline(PolynomialFeatures(degree=5), LinearRegression())\n", + "theorist = theorist_from_estimator(regressor)\n", + "\n", + "def get_equation(r):\n", + " t = r.named_steps['polynomialfeatures'].get_feature_names_out()\n", + " c = r.named_steps['linearregression'].coef_\n", + " return pd.DataFrame({\"t\": t, \"coefficient\": c.reshape(t.shape)})\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Directly Chaining State Based Functions\n", + "\n", + "Now we run the theorist on the result of the experiment_runner (by chaining the two functions)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t = theorist(experiment_runner(s, rng=np.random.default_rng(1)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The fitted coefficients are:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tcoefficient
010.000000
1x-161.235264
2x^2-2.092934
3x^31.487881
4x^4-0.002423
5x^5-0.002523
\n", + "
" + ], + "text/plain": [ + " t coefficient\n", + "0 1 0.000000\n", + "1 x -161.235264\n", + "2 x^2 -2.092934\n", + "3 x^3 1.487881\n", + "4 x^4 -0.002423\n", + "5 x^5 -0.002523" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_equation(t.model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating A Pipeline With State Based Functions\n", + "\n", + "Now we can define the simplest pipeline which runs the experiment runner and theorist in sequence and returns the\n", + "updated state:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def pipeline(state: State, rng=None) -> State:\n", + " s_ = state\n", + " t_ = experiment_runner(s_, rng=rng)\n", + " u_ = theorist(t_)\n", + " return u_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running this pipeline is the same as running the individual steps – just pass the state object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tcoefficient
010.000000
1x-161.235264
2x^2-2.092934
3x^31.487881
4x^4-0.002423
5x^5-0.002523
\n", + "
" + ], + "text/plain": [ + " t coefficient\n", + "0 1 0.000000\n", + "1 x -161.235264\n", + "2 x^2 -2.092934\n", + "3 x^3 1.487881\n", + "4 x^4 -0.002423\n", + "5 x^5 -0.002523" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "u = pipeline(s, rng=np.random.default_rng(1))\n", + "get_equation(u.model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the pipeline function operates on the `State` itself and returns a `State`, we can chain these pipelines in the same fashion as we chain the theorist and experiment_runner:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tcoefficient
010.000000
1x-169.149674
2x^22.803475
3x^31.617091
4x^4-0.022233
5x^5-0.002868
\n", + "
" + ], + "text/plain": [ + " t coefficient\n", + "0 1 0.000000\n", + "1 x -169.149674\n", + "2 x^2 2.803475\n", + "3 x^3 1.617091\n", + "4 x^4 -0.022233\n", + "5 x^5 -0.002868" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "u_ = pipeline(pipeline(s, rng=np.random.default_rng(1)))\n", + "get_equation(u_.model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To show what's happening, we'll show the data, best fit model and ground truth:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tcoefficient
010.000000
1x-169.149674
2x^22.803475
3x^31.617091
4x^4-0.022233
5x^5-0.002868
\n", + "
" + ], + "text/plain": [ + " t coefficient\n", + "0 1 0.000000\n", + "1 x -169.149674\n", + "2 x^2 2.803475\n", + "3 x^3 1.617091\n", + "4 x^4 -0.022233\n", + "5 x^5 -0.002868" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def show_best_fit(state):\n", + " state.experiment_data.plot.scatter(\"x\", \"y\", s=1, alpha=0.5, c=\"gray\")\n", + "\n", + " observed_x = state.experiment_data[[\"x\"]].sort_values(by=\"x\")\n", + " observed_x = pd.DataFrame({\"x\": np.linspace(observed_x[\"x\"].min(), observed_x[\"x\"].max(), 101)})\n", + "\n", + " plt.plot(observed_x, state.model.predict(observed_x), label=\"best fit\")\n", + " \n", + " allowed_x = pd.Series(np.linspace(*state.variables.independent_variables[0].value_range, 101), name=\"x\")\n", + " plt.plot(allowed_x, ground_truth(allowed_x), label=\"ground truth\")\n", + " \n", + " plt.legend()\n", + "\n", + "def show_coefficients(state):\n", + " return get_equation(state.model)\n", + "\n", + "show_best_fit(u)\n", + "show_coefficients(u)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use this pipeline to make a trivial cycle, where we keep on gathering data until we reach 1000 datapoints. Any\n", + " condition defined on the state object could be used here, though." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "v = s\n", + "while len(v.experiment_data) < 1_000: # any condition on the state can be used here.\n", + " v = pipeline(v)\n", + "show_best_fit(v)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating Generators With State Based Functions\n", + "\n", + "We can redefine the pipeline as a generator, which can be operated on using iteration tools:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def cycle(state: State) -> State:\n", + " s_ = state\n", + " while True:\n", + " s_ = experiment_runner(s_)\n", + " s_ = theorist(s_)\n", + " yield s_\n", + "\n", + "cycle_generator = cycle(s)\n", + "\n", + "for i in range(1000):\n", + " t = next(cycle_generator)\n", + "show_best_fit(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also define a cycle (or a sequence of steps) which yield the intermediate results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "v0 = s\n", + "def cycle(state: State) -> State:\n", + " s_ = state\n", + " while True:\n", + " print(\"#-- running experiment_runner --#\\n\")\n", + " s_ = experiment_runner(s_)\n", + " yield s_\n", + " print(\"#-- running theorist --#\\n\")\n", + " s_ = theorist(s_)\n", + " yield s_\n", + "\n", + "cycle_generator = cycle(v0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At the outset, we have no model and an emtpy experiment_data dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "v0.model=None, \n", + "v0.experiment_data.shape=(0, 2)\n" + ] + } + ], + "source": [ + "print(f\"{v0.model=}, \\n{v0.experiment_data.shape=}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the first `next`, we only run the \"experiment_runner\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#-- running experiment_runner --#\n", + "\n", + "v1.model=None, \n", + "v1.experiment_data.shape=(101, 2)\n" + ] + } + ], + "source": [ + "v1 = next(cycle_generator)\n", + "print(f\"{v1.model=}, \\n{v1.experiment_data.shape=}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next step, we run the theorist on that data, but we don't add any new data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#-- running theorist --#\n", + "\n", + "v2.model=Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=5)),\n", + " ('linearregression', LinearRegression())]), \n", + "v2.experiment_data.shape=(101, 2)\n" + ] + } + ], + "source": [ + "v2 = next(cycle_generator)\n", + "print(f\"{v2.model=}, \\n{v2.experiment_data.shape=}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next step, we run the experiment runner again and gather more observations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#-- running experiment_runner --#\n", + "\n", + "v3.model=Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=5)),\n", + " ('linearregression', LinearRegression())]), \n", + 
"v3.experiment_data.shape=(202, 2)\n" + ] + } + ], + "source": [ + "v3 = next(cycle_generator)\n", + "print(f\"{v3.model=}, \\n{v3.experiment_data.shape=}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding The Experimentalist\n", + "Modifying the code to use a custom experimentalist is simple.\n", + "We define an experimentalist which adds four observations each cycle:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Snapshot(variables=VariableCollection(independent_variables=[Variable(name='x', value_range=(-15, 15), allowed_values=None, units='', type=, variable_label='', rescale=1, is_covariate=False)], dependent_variables=[Variable(name='y', value_range=None, allowed_values=None, units='', type=, variable_label='', rescale=1, is_covariate=False)], covariates=[]), params={}, experiment_data=Empty DataFrame\n", + "Columns: [x, y]\n", + "Index: [], conditions= x\n", + "0 2.281691, model=None)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experimentalist_rng = np.random.default_rng(180)\n", + "@wrap_to_use_state\n", + "\n", + "def experimentalist(variables: VariableCollection, n_samples=1):\n", + " names = [v.name for v in variables.independent_variables]\n", + " low = [v.value_range[0] for v in variables.independent_variables]\n", + " high = [v.value_range[1] for v in variables.independent_variables]\n", + " x_range = experimentalist_rng.uniform(low, high, size=n_samples)\n", + " conditions = pd.DataFrame({\"x\": x_range})\n", + " return Delta(conditions=conditions)\n", + "\n", + "experimentalist(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "u0 = s\n", + "for i in range(5):\n", + " u0 = experimentalist(u0, n_samples=10)\n", + " u0 = experiment_runner(u0)\n", + " u0 = theorist(u0)\n", + " show_best_fit(u0)\n", + " plt.title(f\"{i=}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/mkdocs.yml b/mkdocs.yml index b9a35944..dd9f7238 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,4 +20,5 @@ nav: - Random: - Home: 'experimentalists/sampler/random/index.md' - Quickstart: 'experimentalists/sampler/random/quickstart.md' - +- Cycle: + - Functional: 'cycle/Linear and Cyclical Workflows using Functions and States.ipynb' diff --git a/src/autora/state/delta.py b/src/autora/state/delta.py new file mode 100644 index 00000000..1a6fefaf --- /dev/null +++ b/src/autora/state/delta.py @@ -0,0 +1,321 @@ +"""Classes to represent cycle state $S$ as $S_n = S_{0} + \\sum_{i=1}^n \\Delta S_{i}""" +from __future__ import annotations + +import dataclasses +import inspect +from collections import UserDict +from dataclasses import dataclass, fields, replace +from functools import singledispatch, wraps +from typing import Generic, List, TypeVar + +import numpy as np +import pandas as pd + +S = TypeVar("S") +T = TypeVar("T") + + +@dataclass(frozen=True) +class State: + """ + Base object for dataclasses which use the Delta mechanism. + + Examples: + >>> from dataclasses import dataclass, field + + We define a dataclass where each field (which is going to be delta-ed) has additional + metadata "delta" which describes its delta behaviour. + >>> @dataclass(frozen=True) + ... 
class ListState(State): + ... l: List = field(default_factory=list, metadata={"delta": "extend"}) + ... m: List = field(default_factory=list, metadata={"delta": "replace"}) + + Now we instantiate the dataclass... + >>> l = ListState(l=list("abc"), m=list("xyz")) + >>> l + ListState(l=['a', 'b', 'c'], m=['x', 'y', 'z']) + + ... and can add deltas to it. `l` will be extended: + >>> l + Delta(l=list("def")) + ListState(l=['a', 'b', 'c', 'd', 'e', 'f'], m=['x', 'y', 'z']) + + ... wheras `m` will be replaced: + >>> l + Delta(m=list("uvw")) + ListState(l=['a', 'b', 'c'], m=['u', 'v', 'w']) + + ... they can be chained: + >>> l + Delta(l=list("def")) + Delta(m=list("uvw")) + ListState(l=['a', 'b', 'c', 'd', 'e', 'f'], m=['u', 'v', 'w']) + + ... and we update multiple fields with one Delta: + >>> l + Delta(l=list("ghi"), m=list("rst")) + ListState(l=['a', 'b', 'c', 'g', 'h', 'i'], m=['r', 's', 't']) + + Passing a nonexistent field will cause an error: + >>> l + Delta(o="not a field") + Traceback (most recent call last): + ... + AttributeError: key=`o` is missing on ListState(l=['a', 'b', 'c'], m=['x', 'y', 'z']) + + We can also use the `.update` method to do the same thing: + >>> l.update(l=list("ghi"), m=list("rst")) + ListState(l=['a', 'b', 'c', 'g', 'h', 'i'], m=['r', 's', 't']) + + We can also define fields which `append` the last result: + >>> @dataclass(frozen=True) + ... class AppendState(State): + ... 
n: List = field(default_factory=list, metadata={"delta": "append"}) + + >>> m = AppendState(n=list("ɑβɣ")) + >>> m + AppendState(n=['ɑ', 'β', 'ɣ']) + + `n` will be appended: + >>> m + Delta(n="∂") + AppendState(n=['ɑ', 'β', 'ɣ', '∂']) + + """ + + def __add__(self, other: Delta): + updates = dict() + for key, other_value in other.data.items(): + try: + self_field = next(filter(lambda f: f.name == key, fields(self))) + except StopIteration: + raise AttributeError("key=`%s` is missing on %s" % (key, self)) + delta_behavior = self_field.metadata["delta"] + self_value = getattr(self, key) + if delta_behavior == "extend": + extended_value = extend(self_value, other_value) + updates[key] = extended_value + elif delta_behavior == "append": + appended_value = append(self_value, other_value) + updates[key] = appended_value + elif delta_behavior == "replace": + updates[key] = other_value + else: + raise NotImplementedError( + "delta_behaviour=`%s` not implemented" % (delta_behavior) + ) + + new = replace(self, **updates) + return new + + def update(self, **kwargs): + return self + Delta(**kwargs) + + +class Delta(UserDict, Generic[S]): + """ + Represents a delta where the base object determines the extension behavior. + + Examples: + >>> from dataclasses import dataclass + + First we define the dataclass to act as the basis: + >>> from typing import Optional, List + >>> @dataclass(frozen=True) + ... class ListState: + ... l: Optional[List] = None + ... m: Optional[List] = None + ... + """ + + pass + + +Result = Delta +"""`Result` is an alias for `Delta`.""" + + +@singledispatch +def extend(a, b): + """ + Function to extend supported datatypes. 
def wrap_to_use_state(f):
    """Decorator to make target `f` into a function on a `State` and `**kwargs`.

    This wrapper makes it easier to pass arguments to a function from a State.

    It was inspired by the pytest "fixtures" mechanism.

    Args:
        f: a function whose parameter names select the fields of the state it needs, and
            whose return value can be added to the state (normally a `Delta`).

    Returns:
        A function which takes the state as its only positional argument, plus optional
        keyword arguments, and returns `state + f(...)`.

    Raises:
        TypeError: (from the returned function) if the state is not a dataclass.

    Examples:
        >>> from autora.state.delta import State, Delta
        >>> from dataclasses import dataclass, field
        >>> import pandas as pd
        >>> from typing import List, Optional

        The `State` it operates on needs to have the metadata described in the state module:
        >>> @dataclass(frozen=True)
        ... class S(State):
        ...     conditions: List[int] = field(metadata={"delta": "replace"})

        We indicate the inputs required by the parameter names.
        The output must be a `Delta` object.
        >>> from autora.state.delta import Delta
        >>> @wrap_to_use_state
        ... def experimentalist(conditions):
        ...     new_conditions = [c + 10 for c in conditions]
        ...     return Delta(conditions=new_conditions)

        >>> experimentalist(S(conditions=[1,2,3,4]))
        S(conditions=[11, 12, 13, 14])

        >>> experimentalist(S(conditions=[101,102,103,104]))
        S(conditions=[111, 112, 113, 114])

        >>> from autora.variable import VariableCollection, Variable
        >>> from sklearn.base import BaseEstimator
        >>> from sklearn.linear_model import LinearRegression

        >>> @wrap_to_use_state
        ... def theorist(experiment_data: pd.DataFrame, variables: VariableCollection, **kwargs):
        ...     ivs = [v.name for v in variables.independent_variables]
        ...     dvs = [v.name for v in variables.dependent_variables]
        ...     X, y = experiment_data[ivs], experiment_data[dvs]
        ...     new_model = LinearRegression(fit_intercept=True).set_params(**kwargs).fit(X, y)
        ...     return Delta(model=new_model)

        >>> @dataclass(frozen=True)
        ... class T(State):
        ...     variables: VariableCollection  # field(metadata={"delta":... }) omitted ∴ immutable
        ...     experiment_data: pd.DataFrame = field(metadata={"delta": "extend"})
        ...     model: Optional[BaseEstimator] = field(metadata={"delta": "replace"}, default=None)

        >>> t = T(
        ...     variables=VariableCollection(independent_variables=[Variable("x")],
        ...                                  dependent_variables=[Variable("y")]),
        ...     experiment_data=pd.DataFrame({"x": [0,1,2,3,4], "y": [2,3,4,5,6]})
        ... )
        >>> t_prime = theorist(t)
        >>> t_prime.model.coef_, t_prime.model.intercept_
        (array([[1.]]), array([2.]))

        Arguments from the state can be overridden by passing them in as keyword arguments (kwargs):
        >>> theorist(t, experiment_data=pd.DataFrame({"x": [0,1,2,3], "y": [12,13,14,15]}))\\
        ...     .model.intercept_
        array([12.])

        ... and other arguments supported by the inner function can also be passed
        (if and only if the inner function allows for and handles `**kwargs` arguments alongside
        the values from the state).
        >>> theorist(t, fit_intercept=False).model.intercept_
        0.0

        Any parameters not provided by the state must be provided by default values or by the
        caller. If the default is specified:
        >>> @wrap_to_use_state
        ... def experimentalist(conditions, offset=25):
        ...     new_conditions = [c + offset for c in conditions]
        ...     return Delta(conditions=new_conditions)

        ... then it need not be passed.
        >>> experimentalist(S(conditions=[1,2,3,4]))
        S(conditions=[26, 27, 28, 29])

        If a default isn't specified:
        >>> @wrap_to_use_state
        ... def experimentalist(conditions, offset):
        ...     new_conditions = [c + offset for c in conditions]
        ...     return Delta(conditions=new_conditions)

        ... then calling the experimentalist without it will throw an error:
        >>> experimentalist(S(conditions=[1,2,3,4]))
        Traceback (most recent call last):
        ...
        TypeError: experimentalist() missing 1 required positional argument: 'offset'

        ... which can be fixed by passing the argument as a keyword to the wrapped function.
        >>> experimentalist(S(conditions=[1,2,3,4]), offset=2)
        S(conditions=[3, 4, 5, 6])

    """
    # Get the set of parameter names from function f's signature
    parameters_ = set(inspect.signature(f).parameters.keys())

    @wraps(f)
    def _f(state_: S, /, **kwargs) -> S:
        # Validate explicitly: an `assert` would vanish when Python runs with `-O`.
        if not dataclasses.is_dataclass(state_):
            raise TypeError(
                "state must be a dataclass, got %s" % type(state_).__name__
            )

        # Get the parameters needed which are available from the state_.
        # All others must be provided as kwargs or default values on f.
        from_state = parameters_.intersection(
            {i.name for i in dataclasses.fields(state_)}
        )
        arguments_from_state = {k: getattr(state_, k) for k in from_state}
        # Explicit kwargs win over values taken from the state.
        arguments = dict(arguments_from_state, **kwargs)
        delta = f(**arguments)
        new_state = state_ + delta
        return new_state

    return _f
+ + Examples: + Empty input leads to an empty state: + >>> History() + History([]) + + ... or with values for any or all of the parameters: + >>> from autora.variable import VariableCollection + >>> History(variables=VariableCollection()) # doctest: +ELLIPSIS + History([Result(data=VariableCollection(...), kind=ResultKind.VARIABLES)]) + + >>> History(params={"some": "params"}) + History([Result(data={'some': 'params'}, kind=ResultKind.PARAMS)]) + + >>> History(conditions=["a condition"]) + History([Result(data='a condition', kind=ResultKind.CONDITION)]) + + >>> History(observations=["an observation"]) + History([Result(data='an observation', kind=ResultKind.OBSERVATION)]) + + >>> from sklearn.linear_model import LinearRegression + >>> History(models=[LinearRegression()]) + History([Result(data=LinearRegression(), kind=ResultKind.MODEL)]) + + Parameters passed to the constructor are included in the history in the following order: + `history`, `variables`, `params`, `conditions`, `observations`, `models` + >>> History(models=['m1', 'm2'], conditions=['c1', 'c2'], + ... observations=['o1', 'o2'], params={'a': 'param'}, + ... variables=VariableCollection(), + ... history=[Result("from history", ResultKind.VARIABLES)] + ... 
def update(
    self,
    variables=None,
    params=None,
    conditions=None,
    observations=None,
    models=None,
    history=None,
):
    """
    Create a new object with updated values.

    The new entries are placed after the existing history in the order `history`,
    `variables`, `params`, `conditions`, `observations`, `models`.

    Args:
        variables: a single datum to be marked as "variables"
        params: a single datum to be marked as "params"
        conditions: an iterable of data, each to be marked as "conditions"
        observations: an iterable of data, each to be marked as "observations"
        models: an iterable of data, each to be marked as "models"
        history: an iterable of Result objects to add to the history; it is copied,
            never modified.

    Returns:
        A new `History`; neither `self` nor any argument is changed.

    Examples:
        The initial object is empty:
        >>> h0 = History()
        >>> h0
        History([])

        We can update the variables using the `.update` method:
        >>> from autora.variable import VariableCollection
        >>> h1 = h0.update(variables=VariableCollection())
        >>> h1  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
        History([Result(data=VariableCollection(...), kind=ResultKind.VARIABLES)])

        ... the original object is unchanged:
        >>> h0
        History([])

        We can update the variables again:
        >>> h2 = h1.update(variables=VariableCollection(["some IV"]))
        >>> h2._by_kind  # doctest: +ELLIPSIS
        Snapshot(variables=VariableCollection(independent_variables=['some IV'],...), ...)

        ... and we see that there is only ever one variables object returned.

        Params is treated the same way as variables:
        >>> hp = h0.update(params={'first': 'params'})
        >>> hp
        History([Result(data={'first': 'params'}, kind=ResultKind.PARAMS)])

        ... where only the most recent "params" object is returned from the `.params` property.
        >>> hp = hp.update(params={'second': 'params'})
        >>> hp.params
        {'second': 'params'}

        ... however, the full history of the params objects remains available, if needed:
        >>> hp  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data={'first': 'params'}, kind=ResultKind.PARAMS),
                 Result(data={'second': 'params'}, kind=ResultKind.PARAMS)])

        When we update the conditions, observations or models, a new entry is added to the
        history:
        >>> h3 = h0.update(models=["1st model"])
        >>> h3  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data='1st model', kind=ResultKind.MODEL)])

        ... so we can see the history of all the models, for instance.
        >>> h3 = h3.update(models=["2nd model"])  # doctest: +NORMALIZE_WHITESPACE
        >>> h3  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data='1st model', kind=ResultKind.MODEL),
                 Result(data='2nd model', kind=ResultKind.MODEL)])

        ... and the full history of models is available using the `.models` parameter:
        >>> h3.models
        ['1st model', '2nd model']

        The same for the observations:
        >>> h4 = h0.update(observations=["1st observation"])
        >>> h4
        History([Result(data='1st observation', kind=ResultKind.OBSERVATION)])

        >>> h4.update(observations=["2nd observation"]
        ... )  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
        History([Result(data='1st observation', kind=ResultKind.OBSERVATION),
                 Result(data='2nd observation', kind=ResultKind.OBSERVATION)])


        The same for the conditions:
        >>> h5 = h0.update(conditions=["1st condition"])
        >>> h5
        History([Result(data='1st condition', kind=ResultKind.CONDITION)])

        >>> h5.update(conditions=["2nd condition"])  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data='1st condition', kind=ResultKind.CONDITION),
                 Result(data='2nd condition', kind=ResultKind.CONDITION)])

        You can also update with multiple conditions, observations and models:
        >>> h0.update(conditions=['c1', 'c2'])  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data='c1', kind=ResultKind.CONDITION),
                 Result(data='c2', kind=ResultKind.CONDITION)])

        >>> h0.update(models=['m1', 'm2'], variables={'m': 1}
        ... )  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data={'m': 1}, kind=ResultKind.VARIABLES),
                 Result(data='m1', kind=ResultKind.MODEL),
                 Result(data='m2', kind=ResultKind.MODEL)])

        >>> h0.update(models=['m1'], observations=['o1'], variables={'m': 1}
        ... )  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data={'m': 1}, kind=ResultKind.VARIABLES),
                 Result(data='o1', kind=ResultKind.OBSERVATION),
                 Result(data='m1', kind=ResultKind.MODEL)])

        We can also update with a complete history:
        >>> History().update(history=[Result(data={'m': 2}, kind=ResultKind.VARIABLES),
        ...                           Result(data='o1', kind=ResultKind.OBSERVATION),
        ...                           Result(data='m1', kind=ResultKind.MODEL)],
        ...                  conditions=['c1']
        ... )  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data={'m': 2}, kind=ResultKind.VARIABLES),
                 Result(data='o1', kind=ResultKind.OBSERVATION),
                 Result(data='m1', kind=ResultKind.MODEL),
                 Result(data='c1', kind=ResultKind.CONDITION)])

        The list passed as `history` is left exactly as it was:
        >>> extra = [Result(data='o1', kind=ResultKind.OBSERVATION)]
        >>> _ = History().update(history=extra, conditions=['c1'])
        >>> extra
        [Result(data='o1', kind=ResultKind.OBSERVATION)]

    """
    # Work on a copy: `+=` on the caller's own list would silently grow it, and a
    # tuple (a valid history for `__init__`) could not be extended with a list at all.
    history_extension = list(history) if history is not None else []

    history_extension += _init_result_list(
        variables=variables,
        params=params,
        conditions=conditions,
        observations=observations,
        models=models,
    )
    new_full_history = self.data + history_extension

    return History(history=new_full_history)
however, the full history of the params objects remains available, if needed: + >>> hp # doctest: +NORMALIZE_WHITESPACE + History([Result(data={'first': 'params'}, kind=ResultKind.PARAMS), + Result(data={'second': 'params'}, kind=ResultKind.PARAMS)]) + + When we update the conditions, observations or models, a new entry is added to the + history: + >>> h3 = h0 + Delta(models=["1st model"]) + >>> h3 # doctest: +NORMALIZE_WHITESPACE + History([Result(data='1st model', kind=ResultKind.MODEL)]) + + ... so we can see the history of all the models, for instance. + >>> h3 = h3 + Delta(models=["2nd model"]) # doctest: +NORMALIZE_WHITESPACE + >>> h3 # doctest: +NORMALIZE_WHITESPACE + History([Result(data='1st model', kind=ResultKind.MODEL), + Result(data='2nd model', kind=ResultKind.MODEL)]) + + ... and the full history of models is available using the `.models` parameter: + >>> h3.models + ['1st model', '2nd model'] + + The same for the observations: + >>> h4 = h0 + Delta(observations=["1st observation"]) + >>> h4 + History([Result(data='1st observation', kind=ResultKind.OBSERVATION)]) + + >>> h4 + Delta(observations=["2nd observation"] + ... 
) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + History([Result(data='1st observation', kind=ResultKind.OBSERVATION), + Result(data='2nd observation', kind=ResultKind.OBSERVATION)]) + + + The same for the conditions: + >>> h5 = h0 + Delta(conditions=["1st condition"]) + >>> h5 + History([Result(data='1st condition', kind=ResultKind.CONDITION)]) + + >>> h5 + Delta(conditions=["2nd condition"]) # doctest: +NORMALIZE_WHITESPACE + History([Result(data='1st condition', kind=ResultKind.CONDITION), + Result(data='2nd condition', kind=ResultKind.CONDITION)]) + + You can also update with multiple conditions, observations and models: + >>> h0 + Delta(conditions=['c1', 'c2']) # doctest: +NORMALIZE_WHITESPACE + History([Result(data='c1', kind=ResultKind.CONDITION), + Result(data='c2', kind=ResultKind.CONDITION)]) + + >>> h0 + Delta(models=['m1', 'm2'], variables={'m': 1} + ... ) # doctest: +NORMALIZE_WHITESPACE + History([Result(data={'m': 1}, kind=ResultKind.VARIABLES), + Result(data='m1', kind=ResultKind.MODEL), + Result(data='m2', kind=ResultKind.MODEL)]) + + >>> h0 + Delta(models=['m1'], observations=['o1'], variables={'m': 1} + ... ) # doctest: +NORMALIZE_WHITESPACE + History([Result(data={'m': 1}, kind=ResultKind.VARIABLES), + Result(data='o1', kind=ResultKind.OBSERVATION), + Result(data='m1', kind=ResultKind.MODEL)]) + + We can also update with a complete history: + >>> History() + Delta(history=[Result(data={'m': 2}, kind=ResultKind.VARIABLES), + ... Result(data='o1', kind=ResultKind.OBSERVATION), + ... Result(data='m1', kind=ResultKind.MODEL)], + ... conditions=['c1'] + ... 
@property
def variables(self) -> VariableCollection:
    """
    The current variables, as reported by the by-kind view of the history.

    Examples:
        The initial object is empty:
        >>> h = History()

        ... and returns an empty variables object
        >>> h.variables
        VariableCollection(independent_variables=[], dependent_variables=[], covariates=[])

        We can update the variables using the `.update` method:
        >>> from autora.variable import VariableCollection
        >>> h = h.update(variables=VariableCollection(independent_variables=['some IV']))
        >>> h.variables  # doctest: +ELLIPSIS
        VariableCollection(independent_variables=['some IV'], ...)

        We can update the variables again:
        >>> h = h.update(variables=VariableCollection(["some other IV"]))
        >>> h.variables  # doctest: +ELLIPSIS
        VariableCollection(independent_variables=['some other IV'], ...)

        ... and we see that there is only ever one variables object returned."""
    snapshot = self._by_kind
    return snapshot.variables
@property
def conditions(self) -> List[ArrayLike]:
    """
    All conditions recorded so far, as reported by the by-kind view of the history.

    Examples:
        View the sequence of conditions with one condition:
        >>> h = History(conditions=[(1,2,3,)])
        >>> h.conditions
        [(1, 2, 3)]

        ... or more conditions:
        >>> h = h.update(conditions=[(4,5,6),(7,8,9)])  # doctest: +NORMALIZE_WHITESPACE
        >>> h.conditions
        [(1, 2, 3), (4, 5, 6), (7, 8, 9)]

    """
    snapshot = self._by_kind
    return snapshot.conditions
def filter_by(self, kind: Optional[Set[Union[str, ResultKind]]] = None) -> History:
    """
    Return a history restricted to the data belonging to the specified kinds.

    When `kind` is None no filtering is done and the object itself is returned;
    otherwise a new `History` holding only the matching entries is built.

    Examples:
        >>> h = History(models=['m1', 'm2'], conditions=['c1', 'c2'],
        ...             observations=['o1', 'o2'], params={'a': 'param'},
        ...             variables=VariableCollection(),
        ...             history=[Result("from history", ResultKind.VARIABLES)])

        >>> h.filter_by(kind={"MODEL"})  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data='m1', kind=ResultKind.MODEL),
                 Result(data='m2', kind=ResultKind.MODEL)])

        >>> h.filter_by(kind={ResultKind.OBSERVATION})  # doctest: +NORMALIZE_WHITESPACE
        History([Result(data='o1', kind=ResultKind.OBSERVATION),
                 Result(data='o2', kind=ResultKind.OBSERVATION)])

        If we don't specify any filter criteria, we get the full history back:
        >>> h.filter_by()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
        History([Result(data='from history', kind=ResultKind.VARIABLES),
                 Result(data=VariableCollection(...), kind=ResultKind.VARIABLES),
                 Result(data={'a': 'param'}, kind=ResultKind.PARAMS),
                 Result(data='c1', kind=ResultKind.CONDITION),
                 Result(data='c2', kind=ResultKind.CONDITION),
                 Result(data='o1', kind=ResultKind.OBSERVATION),
                 Result(data='o2', kind=ResultKind.OBSERVATION),
                 Result(data='m1', kind=ResultKind.MODEL),
                 Result(data='m2', kind=ResultKind.MODEL)])

    """
    # Nothing to filter on: hand back the object itself.
    if kind is None:
        return self

    # Accept both names ("MODEL") and enum members by normalising to ResultKind.
    wanted = {ResultKind(entry) for entry in kind}
    return History(history=_filter_history(self.data, wanted))
def _init_result_list(
    variables: Optional[VariableCollection] = None,
    params: Optional[Dict] = None,
    conditions: Optional[Iterable[ArrayLike]] = None,
    observations: Optional[Iterable[ArrayLike]] = None,
    models: Optional[Iterable[BaseEstimator]] = None,
) -> List[Result]:
    """
    Initialize a list of Result objects

    Args:
        variables: a single datum to be marked as "variables"
        params: a single datum to be marked as "params"
        conditions: an iterable of data, each to be marked as "conditions"
        observations: an iterable of data, each to be marked as "observations"
        models: an iterable of data, each to be marked as "models"

    Returns:
        The `Result` objects for every argument which is not None, in the order
        `variables`, `params`, `conditions`, `observations`, `models`.

    Examples:
        Empty input leads to an empty state:
        >>> _init_result_list()
        []

        ... or with values for any or all of the parameters:
        >>> from autora.variable import VariableCollection
        >>> _init_result_list(variables=VariableCollection())  # doctest: +ELLIPSIS
        [Result(data=VariableCollection(...), kind=ResultKind.VARIABLES)]

        >>> _init_result_list(params={"some": "params"})
        [Result(data={'some': 'params'}, kind=ResultKind.PARAMS)]

        >>> _init_result_list(conditions=["a condition"])
        [Result(data='a condition', kind=ResultKind.CONDITION)]

        >>> _init_result_list(observations=["an observation"])
        [Result(data='an observation', kind=ResultKind.OBSERVATION)]

        >>> from sklearn.linear_model import LinearRegression
        >>> _init_result_list(models=[LinearRegression()])
        [Result(data=LinearRegression(), kind=ResultKind.MODEL)]

        The input arguments are added to the data in the order `variables`,
        `params`, `conditions`, `observations`, `models`:
        >>> _init_result_list(variables=VariableCollection(),
        ...                   params={"some": "params"},
        ...                   conditions=["a condition"],
        ...                   observations=["an observation", "another observation"],
        ...                   models=[LinearRegression()],
        ... )  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
        [Result(data=VariableCollection(...), kind=ResultKind.VARIABLES),
         Result(data={'some': 'params'}, kind=ResultKind.PARAMS),
         Result(data='a condition', kind=ResultKind.CONDITION),
         Result(data='an observation', kind=ResultKind.OBSERVATION),
         Result(data='another observation', kind=ResultKind.OBSERVATION),
         Result(data=LinearRegression(), kind=ResultKind.MODEL)]

    """
    results = []

    # `variables` and `params` are single values: each yields at most one entry.
    for datum, kind in (
        (variables, ResultKind.VARIABLES),
        (params, ResultKind.PARAMS),
    ):
        if datum is not None:
            results.append(Result(datum, kind))

    # The remaining arguments are collections: one entry per element, in order.
    for collection, kind in (
        (conditions, ResultKind.CONDITION),
        (observations, ResultKind.OBSERVATION),
        (models, ResultKind.MODEL),
    ):
        if collection is not None:
            results.extend(Result(item, kind=kind) for item in collection)

    return results
+ + >>> from sklearn.linear_model import LinearRegression + >>> history_ = [Result(LinearRegression(), kind=ResultKind.MODEL)] + >>> _history_to_kind(history_) # doctest: +ELLIPSIS + Snapshot(..., models=[LinearRegression()]) + + >>> from autora.variable import VariableCollection, IV + >>> variables = VariableCollection(independent_variables=[IV(name="example")]) + >>> history_ = [Result(variables, kind=ResultKind.VARIABLES)] + >>> _history_to_kind(history_) # doctest: +ELLIPSIS + Snapshot(variables=VariableCollection(independent_variables=[IV(name='example', ... + + >>> history_ = [Result({'some': 'params'}, kind=ResultKind.PARAMS)] + >>> _history_to_kind(history_) # doctest: +ELLIPSIS + Snapshot(..., params={'some': 'params'}, ...) + + """ + namespace = Snapshot( + variables=_get_last_data_with_default( + history, kind={ResultKind.VARIABLES}, default=VariableCollection() + ), + params=_get_last_data_with_default( + history, kind={ResultKind.PARAMS}, default={} + ), + observations=_list_data( + _filter_history(history, kind={ResultKind.OBSERVATION}) + ), + models=_list_data(_filter_history(history, kind={ResultKind.MODEL})), + conditions=_list_data(_filter_history(history, kind={ResultKind.CONDITION})), + ) + return namespace + + +def _list_data(data: Sequence[SupportsDataKind]): + """ + Extract the `.data` attribute of each item in a sequence, and return as a list. 
def _filter_history(data: Iterable[SupportsDataKind], kind: Set[ResultKind]):
    """Lazily yield, in their original order, the entries of `data` whose `.kind` is in `kind`."""

    def _is_wanted(entry):
        return entry.kind in kind

    return filter(_is_wanted, data)


def _get_last(data: Sequence[SupportsDataKind], kind: Set[ResultKind]):
    """
    Return the most recent entry of `data` whose `.kind` is in `kind`.

    Raises:
        StopIteration: if `data` holds no entry of a matching kind.
    """
    # Walk the sequence newest-first and stop at the first match.
    return next(_filter_history(reversed(data), kind=kind))


def _get_last_data_with_default(data: Sequence[SupportsDataKind], kind, default):
    """Return the `.data` of the most recent entry of a matching kind, or `default` if none."""
    try:
        return _get_last(data, kind).data
    except StopIteration:
        return default
def _resolve_properties(params: Dict, state_dependent_properties: Mapping):
    """
    Resolve state-dependent properties inside a nested dictionary.

    In this context, a state-dependent property is a string which is meant to be replaced by
    its updated, current value before the dictionary is used. A state-dependent property might
    be something like "the last theorist available" or "all the experimental results until now".

    The input is never modified: a shallow copy is made at every level of nesting, and only
    the copies have their values replaced.

    Args:
        params: a (nested) dictionary of keys and values, where some values might be
            "property names"
        state_dependent_properties: a mapping of "property names" to their "real values"

    Returns: a (nested) dictionary where "property names" are replaced by the "real values".
        Values which are neither dictionaries nor known property names are kept as they are.

    Examples:

        >>> params_0 = {"key": "%foo%"}
        >>> cycle_properties_0 = {"%foo%": 180}
        >>> _resolve_properties(params_0,cycle_properties_0)
        {'key': 180}

        >>> params_1 = {"key": "%bar%", "nested_dict": {"inner_key": "%foobar%"}}
        >>> cycle_properties_1 = {"%bar%": 1, "%foobar%": 2}
        >>> _resolve_properties(params_1,cycle_properties_1)
        {'key': 1, 'nested_dict': {'inner_key': 2}}

        >>> params_2 = {"key": "baz"}
        >>> _resolve_properties(params_2,cycle_properties_1)
        {'key': 'baz'}

    """
    # Shallow copy: keeps the caller's mapping (and its concrete type) untouched.
    resolved = copy.copy(params)
    for key, value in params.items():
        if isinstance(value, dict):
            # Nested dictionaries are resolved recursively into fresh copies.
            resolved[key] = _resolve_properties(value, state_dependent_properties)
        elif isinstance(value, str) and value in state_dependent_properties:
            # The value is the name of a state-dependent property: look it up now.
            resolved[key] = state_dependent_properties[value]
        else:
            # Report the specific entry which is left alone (not the whole dictionary),
            # and let the logging module format it only if debug output is enabled.
            _logger.debug("leaving params[%r]=%r unchanged", key, value)

    return resolved
class ResultKind(str, Enum):
    """
    The categories of entry which a Result object can carry.

    Every member is also a `str` whose value equals its own name, so members compare equal
    to the corresponding plain string.

    Examples:
        >>> ResultKind.CONDITION is ResultKind.CONDITION
        True

        >>> ResultKind.CONDITION is ResultKind.VARIABLES
        False

        >>> ResultKind.CONDITION == "CONDITION"
        True

        >>> ResultKind.CONDITION == "VARIABLES"
        False

        >>> ResultKind.CONDITION in {ResultKind.CONDITION, ResultKind.PARAMS}
        True

        >>> ResultKind.VARIABLES in {ResultKind.CONDITION, ResultKind.PARAMS}
        False
    """

    CONDITION = "CONDITION"
    OBSERVATION = "OBSERVATION"
    MODEL = "MODEL"
    PARAMS = "PARAMS"
    VARIABLES = "VARIABLES"

    def __repr__(self):
        # Render as `ClassName.MEMBER`, without the value the default Enum repr appends.
        return "{}.{}".format(type(self).__name__, self.name)
+ + +SupportsControllerState = Union[ + SupportsControllerStateFields, SupportsControllerStateProperties +] + + +class SupportsControllerStateHistory(SupportsControllerStateProperties, Protocol): + """Represents controller state as a linear sequence of entries.""" + + def __init__(self, history: Sequence[SupportsDataKind]): + ... + + def filter_by(self: State, kind: Optional[Set[Union[str, ResultKind]]]) -> State: + ... + + @property + def history(self) -> Sequence[SupportsDataKind]: + ... + + +class Executor(Protocol, Generic[State]): + """A Callable which, given some state, and some parameters, returns an updated state.""" + + def __call__(self, __state: State, params: Dict) -> State: + ... + + +ExecutorCollection = Mapping[str, Executor] + + +@runtime_checkable +class SupportsLoadDump(Protocol): + def dump(self, data, file) -> None: + ... + + def load(self, file) -> Any: + ... diff --git a/src/autora/state/snapshot.py b/src/autora/state/snapshot.py new file mode 100644 index 00000000..21be8171 --- /dev/null +++ b/src/autora/state/snapshot.py @@ -0,0 +1,201 @@ +""" Classes for storing and passing a cycle's state as an immutable snapshot. 
@dataclass(frozen=True)
class Snapshot(SupportsControllerStateFields):
    """
    An object passed between and updated by processing steps in the Controller.

    Instances are frozen: `update` and `+` never modify the object they are called on, but
    return a new `Snapshot`. `variables` and `params` hold one current value each, which an
    update replaces; `conditions`, `observations` and `models` are lists which an update
    extends.
    """

    # Single values: replaced as a whole by `update`
    variables: VariableCollection = field(default_factory=VariableCollection)
    params: Dict = field(default_factory=dict)

    # Sequences: extended by `update`
    conditions: List[ArrayLike] = field(default_factory=list)
    observations: List[ArrayLike] = field(default_factory=list)
    models: List[BaseEstimator] = field(default_factory=list)

    def update(
        self,
        variables=None,
        params=None,
        conditions=None,
        observations=None,
        models=None,
    ):
        """
        Create a new object with updated values.

        Args:
            variables: replacement for `variables`, or `None` to keep the current value
            params: replacement for `params`, or `None` to keep the current value
            conditions: list of entries appended to `conditions`, or `None` to add nothing
            observations: list of entries appended to `observations`, or `None` to add nothing
            models: list of entries appended to `models`, or `None` to add nothing

        Returns: a new `Snapshot`; the object the method is called on is unchanged.

        Raises:
            AssertionError: if `conditions`, `observations` or `models` is given but is not
                a list.

        Examples:
            The initial object is empty:
            >>> s0 = Snapshot()
            >>> s0  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Snapshot(variables=VariableCollection(...), params={}, conditions=[],
                     observations=[], models=[])

            We can update the params using the `.update` method:
            >>> s0.update(params={'first': 'params'})  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Snapshot(..., params={'first': 'params'}, ...)

            ... but the original object is unchanged:
            >>> s0  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Snapshot(..., params={}, ...)

            For params, only one object is returned from the respective property:
            >>> s0.update(params={'first': 'params'}).update(params={'second': 'params'}).params
            {'second': 'params'}

            Only `None` means "no change", so an empty value replaces the current one:
            >>> s0.update(params={'first': 'params'}).update(params={}).params
            {}

            ... and the same applies to variables:
            >>> from autora.variable import VariableCollection, IV
            >>> (s0.update(variables=VariableCollection([IV("1st IV")]))
            ...    .update(variables=VariableCollection([IV("2nd IV")]))).variables
            VariableCollection(independent_variables=[IV(name='2nd IV',...)], ...)

            When we update the conditions, observations or models, the respective list is
            extended:
            >>> s3 = s0.update(models=["1st model"])
            >>> s3
            Snapshot(..., models=['1st model'])

            ... so we can see the history of all the models, for instance.
            >>> s3.update(models=["2nd model"])
            Snapshot(..., models=['1st model', '2nd model'])

            The same applies to observations:
            >>> s4 = s0.update(observations=["1st observation"])
            >>> s4
            Snapshot(..., observations=['1st observation'], ...)

            >>> s4.update(observations=["2nd observation"])  # doctest: +ELLIPSIS
            Snapshot(..., observations=['1st observation', '2nd observation'], ...)


            The same applies to conditions:
            >>> s5 = s0.update(conditions=["1st condition"])
            >>> s5
            Snapshot(..., conditions=['1st condition'], ...)

            >>> s5.update(conditions=["2nd condition"])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Snapshot(..., conditions=['1st condition', '2nd condition'], ...)

            You can also update with multiple conditions, observations and models:
            >>> s0.update(conditions=['c1', 'c2'])
            Snapshot(..., conditions=['c1', 'c2'], ...)

            >>> s0.update(models=['m1', 'm2'], variables={'m': 1})
            Snapshot(variables={'m': 1}, ..., models=['m1', 'm2'])

            >>> s0.update(models=['m1'], observations=['o1'], variables={'m': 1})
            Snapshot(variables={'m': 1}, ..., observations=['o1'], models=['m1'])


            Inputs to models, observations and conditions must be lists:
            >>> s0.update(models='m1')  # doctest: +ELLIPSIS
            Traceback (most recent call last):
            ...
            AssertionError: 'm1' must be a list, e.g. `['m1']`?)

        """

        def _require_list(value):
            # Raised explicitly rather than through an `assert` statement, so that the
            # check is not stripped when Python runs with `-O`. The exception type and
            # message are the ones callers have always seen.
            if not isinstance(value, list):
                raise AssertionError(
                    f"{repr(value)} must be a list, e.g. `[{repr(value)}]`?)"
                )

        def _coalesce_lists(old, new):
            _require_list(old)
            if new is None:
                return old
            _require_list(new)
            return old + list(new)

        # Only `None` means "keep the current value". Testing truthiness instead would
        # silently discard legitimate but falsy updates, such as an empty params dict.
        variables_ = self.variables if variables is None else variables
        params_ = self.params if params is None else params
        conditions_ = _coalesce_lists(self.conditions, conditions)
        observations_ = _coalesce_lists(self.observations, observations)
        models_ = _coalesce_lists(self.models, models)
        return Snapshot(variables_, params_, conditions_, observations_, models_)

    def __add__(self, other: Delta):
        """
        Add a delta to the object.

        The entries of `other` are passed to `update` as keyword arguments, so `s + delta`
        follows the same rules as `s.update(...)`: `variables` and `params` are replaced,
        `conditions`, `observations` and `models` are extended, and `s` itself is unchanged.

        Examples:
            The initial object is empty:
            >>> s0 = Snapshot()
            >>> s0  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Snapshot(variables=VariableCollection(...), params={}, conditions=[],
                     observations=[], models=[])

            We can update the params using the `+` operator:
            >>> from autora.state.delta import Delta
            >>> s0 + Delta(params={'first': 'params'})  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Snapshot(..., params={'first': 'params'}, ...)

            ... but the original object is unchanged:
            >>> s0  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Snapshot(..., params={}, ...)

            For params, only one object is returned from the respective property:
            >>> (s0 + Delta(params={'first': 'params'}) + Delta(params={'second':'params'})).params
            {'second': 'params'}

            ... and the same applies to variables:
            >>> from autora.variable import VariableCollection, IV
            >>> (s0 + Delta(variables=VariableCollection([IV("1st IV")])) +
            ...       Delta(variables=VariableCollection([IV("2nd IV")]))).variables
            VariableCollection(independent_variables=[IV(name='2nd IV',...)], ...)

            When we update the conditions, observations or models, the respective list is
            extended:
            >>> s3 = s0 + Delta(models=["1st model"])
            >>> s3
            Snapshot(..., models=['1st model'])

            ... so we can see the history of all the models, for instance.
            >>> s3 + Delta(models=["2nd model"])
            Snapshot(..., models=['1st model', '2nd model'])

            The same applies to observations:
            >>> s4 = s0 + Delta(observations=["1st observation"])
            >>> s4
            Snapshot(..., observations=['1st observation'], ...)

            >>> s4 + Delta(observations=["2nd observation"])  # doctest: +ELLIPSIS
            Snapshot(..., observations=['1st observation', '2nd observation'], ...)


            The same applies to conditions:
            >>> s5 = s0 + Delta(conditions=["1st condition"])
            >>> s5
            Snapshot(..., conditions=['1st condition'], ...)

            >>> s5 + Delta(conditions=["2nd condition"])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
            Snapshot(..., conditions=['1st condition', '2nd condition'], ...)

            You can also update with multiple conditions, observations and models:
            >>> s0 + Delta(conditions=['c1', 'c2'])
            Snapshot(..., conditions=['c1', 'c2'], ...)

            >>> s0 + Delta(models=['m1', 'm2'], variables={'m': 1})
            Snapshot(variables={'m': 1}, ..., models=['m1', 'm2'])

            >>> s0 + Delta(models=['m1'], observations=['o1'], variables={'m': 1})
            Snapshot(variables={'m': 1}, ..., observations=['o1'], models=['m1'])


            Inputs to models, observations and conditions must be lists:
            >>> s0 + Delta(models='m1')  # doctest: +ELLIPSIS
            Traceback (most recent call last):
            ...
            AssertionError: 'm1' must be a list, e.g. `['m1']`?)
        """
        return self.update(**other)
def theorist_from_estimator(estimator: BaseEstimator) -> Executor:
    """
    Convert a scikit-learn compatible estimator into a function on a `State` object.

    The returned function is wrapped with `wrap_to_use_state`. The wrapped step takes
    `experiment_data` and `variables`, uses the columns named after the independent
    variables as inputs and those named after the dependent variables as targets, fits
    the estimator, and returns the fitted estimator as `Delta(model=...)`.

    Supports passing additional `**kwargs` which are used to update the estimator's params
    before fitting.
    """

    @wrap_to_use_state
    def theorist(
        experiment_data: pd.DataFrame, variables: VariableCollection, **kwargs
    ):
        ivs = [v.name for v in variables.independent_variables]
        dvs = [v.name for v in variables.dependent_variables]
        X, y = experiment_data[ivs], experiment_data[dvs]
        # NOTE(review): `set_params` and `fit` are called on the one estimator captured
        # above. Assuming the usual scikit-learn convention that both return `self`,
        # every call hands back the same, refitted object -- confirm whether a fresh
        # clone per call is wanted when several models are kept side by side.
        new_model = estimator.set_params(**kwargs).fit(X, y)
        return Delta(model=new_model)

    return theorist


def experiment_runner_from_x_to_y_function(f: Callable[[X], Y]) -> Executor:
    r"""
    Wrapper for experiment_runner of the form $f(x) \rightarrow y$, where `f` returns just
    the $y$ values.

    The returned function is wrapped with `wrap_to_use_state`. The wrapped step calls
    `f(conditions, **kwargs)`, merges the conditions with the result on their indices, and
    returns the merged dataframe as `Delta(experiment_data=...)`.
    """

    @wrap_to_use_state
    def experiment_runner(conditions: pd.DataFrame, **kwargs):
        x = conditions
        y = f(x, **kwargs)
        # Pair each row of conditions with the row of observations sharing its index.
        experiment_data = pd.DataFrame.merge(x, y, left_index=True, right_index=True)
        return Delta(experiment_data=experiment_data)

    return experiment_runner
def experimentalist_from_pipeline(pipeline: Pipeline) -> Executor:
    r"""
    Wrapper for experimentalists of the form $f() \rightarrow x$, where `f` is a pipeline
    which returns the new conditions $x$.

    The returned function is wrapped with `wrap_to_use_state`. The wrapped step calls
    `pipeline(**params)` and returns the result as `Delta(conditions=...)`: dataframes and
    numpy arrays are passed through as they are, any other iterable is collected into a
    numpy array.

    Raises (from the wrapped step):
        NotImplementedError: if the pipeline returns something which is neither a
            dataframe, a numpy array, nor an iterable.
    """

    @wrap_to_use_state
    def experimentalist(params):
        conditions = pipeline(**params)
        # `np.recarray` is a subclass of `np.ndarray`, so this check covers it as well.
        if isinstance(conditions, (pd.DataFrame, np.ndarray)):
            conditions_ = conditions
        elif isinstance(conditions, Iterable):
            # Materialize lazy iterables (e.g. generators) before storing them.
            conditions_ = np.array(list(conditions))
        else:
            raise NotImplementedError("type `%s` is not supported" % (type(conditions)))
        return Delta(conditions=conditions_)

    return experimentalist