From 2dcfe517775c4cef03e0c6fc20f40eba44d8f3a9 Mon Sep 17 00:00:00 2001 From: Christian Zimpelmann Date: Mon, 10 Apr 2023 11:10:56 +0200 Subject: [PATCH] Improve estimation table (#379) --- ..._generate_publication_quality_tables.ipynb | 177 ++++++++++++++++-- src/estimagic/differentiation/derivatives.py | 4 +- .../internal_criterion_template.py | 2 +- src/estimagic/optimization/optimize.py | 4 +- .../visualization/estimation_table.py | 128 +++++++------ tests/visualization/test_estimation_table.py | 38 +++- 6 files changed, 263 insertions(+), 90 deletions(-) diff --git a/docs/source/how_to_guides/miscellaneous/how_to_generate_publication_quality_tables.ipynb b/docs/source/how_to_guides/miscellaneous/how_to_generate_publication_quality_tables.ipynb index d41e691aa..a08e7fb74 100644 --- a/docs/source/how_to_guides/miscellaneous/how_to_generate_publication_quality_tables.ipynb +++ b/docs/source/how_to_guides/miscellaneous/how_to_generate_publication_quality_tables.ipynb @@ -8,20 +8,22 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Estimagic can create publication quality tables of parameter estimates in LaTeX or HTML. It works with the results from `estimate_ml` and `estimate_msm` but also supports statsmodels results out of the box. \n", "\n", - "You can get almost limitless flexibility if you split the table generation into two steps. The fist generates a DataFrame which you can customize to your liking, the second renders that DataFrame in LaTeX or HTML." + "You can get almost limitless flexibility if you split the table generation into two steps. The first generates a DataFrame which you can customize to your liking, the second renders that DataFrame in LaTeX or HTML. If you are interested in this feature, search for \"render_inputs\" below." 
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ + "# Make necessary imports\n", "import estimagic as em\n", "import pandas as pd\n", "import statsmodels.formula.api as sm\n", @@ -38,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -156,7 +158,7 @@ "" ] }, - "execution_count": 3, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -181,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -240,7 +242,7 @@ "Sex -33.789 1.61800 1.000000e-08" ] }, - "execution_count": 4, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -266,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -276,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -398,7 +400,7 @@ "" ] }, - "execution_count": 6, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -436,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -514,7 +516,7 @@ "F Statistic 8.06$^{***}$ 72.90$^{***}$ " ] }, - "execution_count": 7, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -534,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -638,7 +640,7 @@ "" ] }, - "execution_count": 8, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -648,6 +650,140 @@ "HTML(em.render_html(**render_inputs))" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "Using this 2-step-procedure, we can also easily add additional rows to the footer.\n", + "\n", + "Note that we add the row using `.loc[(\"Statsmodels\", )]` since the index of `render_inputs[\"footer\"]` is a MultiIndex.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 target
 (1)(2)(3)
Intercept152.00$^{*** }$152.00$^{*** }$142.00$^{*** }$
(3.61)(2.85)(3.14)
Age301.00$^{*** }$37.20$^{ }$51.50$^{*** }$
(77.10)(64.10)(2.72)
Sex17.40$^{ }$-107.00$^{* }$-33.80$^{*** }$
(77.10)(62.10)(1.62)
BMI787.00$^{*** }$
(65.40)
ABP417.00$^{*** }$
(69.50)
\n", + "
R$^2$0.040.40
Observations442442445
StatsmodelsYesYesNo
\n", + "
Note:***p<0.01; **p<0.05; *p<0.1
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "render_inputs[\"footer\"].loc[(\"Statsmodels\",)] = [\"Yes\"] * 2 + [\"No\"]\n", + "HTML(em.render_html(**render_inputs))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -659,7 +795,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -675,7 +811,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -798,7 +934,7 @@ "" ] }, - "execution_count": 10, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -937,6 +1073,11 @@ "\\end{tabular}\n", "```" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { @@ -962,7 +1103,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.10" } }, "nbformat": 4, diff --git a/src/estimagic/differentiation/derivatives.py b/src/estimagic/differentiation/derivatives.py index b9acc3863..500be4505 100644 --- a/src/estimagic/differentiation/derivatives.py +++ b/src/estimagic/differentiation/derivatives.py @@ -92,7 +92,7 @@ def first_derivative( error_handling (str): One of "continue" (catch errors and continue to calculate derivative estimates. In this case, some derivative estimates can be missing but no errors are raised), "raise" (catch errors and continue - to calculate derivative estimates at fist but raise an error if all + to calculate derivative estimates at first but raise an error if all evaluations for one parameter failed) and "raise_strict" (raise an error as soon as a function evaluation fails). batch_evaluator (str or callable): Name of a pre-implemented batch evaluator @@ -360,7 +360,7 @@ def second_derivative( error_handling (str): One of "continue" (catch errors and continue to calculate derivative estimates. 
In this case, some derivative estimates can be missing but no errors are raised), "raise" (catch errors and continue - to calculate derivative estimates at fist but raise an error if all + to calculate derivative estimates at first but raise an error if all evaluations for one parameter failed) and "raise_strict" (raise an error as soon as a function evaluation fails). batch_evaluator (str or callable): Name of a pre-implemented batch evaluator diff --git a/src/estimagic/optimization/internal_criterion_template.py b/src/estimagic/optimization/internal_criterion_template.py index fa34bf4ae..54fd81880 100644 --- a/src/estimagic/optimization/internal_criterion_template.py +++ b/src/estimagic/optimization/internal_criterion_template.py @@ -61,7 +61,7 @@ def internal_criterion_and_derivative_template( derivative. criterion_and_derivative (callable): Function that returns criterion and derivative as a tuple. This can be used to exploit synergies in the - evaluation of both functions. The fist element of the tuple has to be + evaluation of both functions. The first element of the tuple has to be exactly the same as the output of criterion. The second has to be exactly the same as the output of derivative. numdiff_options (dict): Keyword arguments for the calculation of numerical diff --git a/src/estimagic/optimization/optimize.py b/src/estimagic/optimization/optimize.py index 6bde57eb6..3f8119c01 100644 --- a/src/estimagic/optimization/optimize.py +++ b/src/estimagic/optimization/optimize.py @@ -99,7 +99,7 @@ def maximize( derivative_kwargs (dict): Additional keyword arguments for derivative. criterion_and_derivative (callable): Function that returns criterion and derivative as a tuple. This can be used to exploit synergies in the - evaluation of both functions. The fist element of the tuple has to be + evaluation of both functions. The first element of the tuple has to be exactly the same as the output of criterion. 
The second has to be exactly the same as the output of derivative. criterion_and_derivative_kwargs (dict): Additional keyword arguments for @@ -300,7 +300,7 @@ def minimize( derivative_kwargs (dict): Additional keyword arguments for derivative. criterion_and_derivative (callable): Function that returns criterion and derivative as a tuple. This can be used to exploit synergies in the - evaluation of both functions. The fist element of the tuple has to be + evaluation of both functions. The first element of the tuple has to be exactly the same as the output of criterion. The second has to be exactly the same as the output of derivative. criterion_and_derivative_kwargs (dict): Additional keyword arguments for diff --git a/src/estimagic/visualization/estimation_table.py b/src/estimagic/visualization/estimation_table.py index 16a600467..44c1a5559 100644 --- a/src/estimagic/visualization/estimation_table.py +++ b/src/estimagic/visualization/estimation_table.py @@ -8,6 +8,11 @@ import pandas as pd +suppress_performance_warnings = np.testing.suppress_warnings() +suppress_performance_warnings.filter(category=pd.errors.PerformanceWarning) + + +@suppress_performance_warnings def estimation_table( models, *, @@ -107,7 +112,7 @@ def estimation_table( confidence_intervals (bool): If True, display confidence intervals as inference values. Display standard errors otherwise. Default False. significance_levels (list): a list of floats for p value's significance cut-off - values.This is used to generate the significance stars. Default is + values. This is used to generate the significance stars. Default is [0.1,0.05,0.01]. append_notes (bool): A boolean variable for printing p value cutoff explanation and additional notes, if applicable. Default is True. @@ -121,7 +126,7 @@ def estimation_table( used as row names in the table. number_format (int, str, iterable or callable): A callable, iterable, integer or string that is used to apply string formatter(s) to floats in the - table. 
Defualt ("{0:.3g}", "{0:.5f}", "{0:.4g}"). + table. Default ("{0:.3g}", "{0:.5f}", "{0:.4g}"). add_trailing_zeros (bool): If True, format floats such that they have same number of digits after the decimal point. Default True. siunitx_warning (bool): If True, print warning about LaTex preamble to add for @@ -261,7 +266,7 @@ def render_latex( significance_levels (list or tuple): a list of floats for p value's significance cutt-off values. Default is [0.1,0.05,0.01]. custom_notes (list): A list of strings for additional notes. Default is None. - siunitx_watning (bool): If True, print warning about LaTex preamble to add for + siunitx_warning (bool): If True, print warning about LaTex preamble to add for proper compilation of when working with siunitx package. Default True. show_index_names (bool): If True, display index names in the table. show_col_names (bool): If True, the column names are displayed. @@ -348,7 +353,7 @@ def render_latex( latex_str = latex_str.split("\\bottomrule")[0] if show_footer: footer = footer.copy(deep=True) - footer = footer.apply(_center_align_integers, axis=1) + footer = footer.apply(_center_align_integers_and_non_numeric_strings, axis=1) footer_styler = footer.style stats_str = footer_styler.to_latex(**default_options) if "\\midrule" in stats_str: @@ -543,7 +548,7 @@ def _get_estimation_table_body_and_footer( number_format (int, str, iterable or callable): A callable, iterable, integer or callable that is used to apply string formatter(s) to floats in the table. - add_trailing_zeros (bool): If True, format floats such that they haave same + add_trailing_zeros (bool): If True, format floats such that they have same number of digits after the decimal point. Returns: @@ -623,7 +628,7 @@ def _build_estimation_table_body( number_format (int, str, iterable or callable): A callable, iterable, integer or callable that is used to apply string formatter(s) to floats in the table. 
- add_trailing_zeros (bool): If True, format floats such that they haave same + add_trailing_zeros (bool): If True, format floats such that they have same number of digits after the decimal point. Returns: @@ -718,7 +723,6 @@ def _build_estimation_table_footer( for mod in models ] stats = pd.concat(to_concat, axis=1) - stats = stats.apply(_unformat_integers, axis=1) return stats @@ -768,10 +772,17 @@ def _get_cols_to_format(show_inference, confidence_intervals): def _apply_number_formatting_frames(dfs, columns, number_format, add_trailing_zeros): """Apply string formatter to specific columns of a list of DataFrames.""" - raw_formatted = [_apply_number_format(df[columns], number_format) for df in dfs] + + raw_formatted = [ + _apply_number_format(df[columns], number_format, format_integers=False) + for df in dfs + ] max_trail = int(max([_get_digits_after_decimal(df) for df in raw_formatted])) if add_trailing_zeros: - formatted = [_apply_number_format(df, max_trail) for df in raw_formatted] + formatted = [ + _apply_number_format(df, max_trail, format_integers=True) + for df in raw_formatted + ] else: formatted = raw_formatted return formatted, max_trail @@ -980,7 +991,7 @@ def _convert_frame_to_string_series( df (DataFrame): params DataFrame of the model significance_levels (list): see main docstring - number_format (int): see main docstring + number_format (int, str, iterable or callable): see main docstring show_inference (bool): see main docstring confidence_intervals (bool): see main docstring show_stars (bool): see main docstring @@ -1074,7 +1085,12 @@ def _create_statistics_sr( stats_options (dict): see main docstring significance_levels (list): see main docstring show_stars (bool): see main docstring - number_format (int): see main focstring + number_format (int, str, iterable or callable): see main docstring + add_trailing_zeros (bool): If True, format floats such that they have same + number of digits after the decimal point. 
+ max_trail (int): If add_trailing_zeros is True, add corresponding number of + trailing zeros to floats in the stats DataFrame to have number of digits + after a decimal point equal to max_trail for each float. Returns: series: string series with summary statistics values and additional info @@ -1089,11 +1105,14 @@ def _create_statistics_sr( show_dof = None for k in stats_options: stats_values[stats_options[k]] = model["info"].get(k, np.nan) + raw_formatted = _apply_number_format( - pd.DataFrame(pd.Series(stats_values)), number_format + pd.DataFrame(pd.Series(stats_values)), number_format, format_integers=False ) if add_trailing_zeros: - formatted = _apply_number_format(raw_formatted, max_trail) + formatted = _apply_number_format( + raw_formatted, max_trail, format_integers=False + ) else: formatted = raw_formatted stats_values = formatted.to_dict()[0] @@ -1122,7 +1141,7 @@ def _create_statistics_sr( stats_values["Residual Std. Error"], int(model["info"]["df_resid"]) ) stat_sr = pd.Series(stats_values) - # the follwing is to make sure statistics dataframe has as many levels of + # the following is to make sure statistics dataframe has as many levels of # indices as the parameters dataframe. stat_ind = np.empty((len(stat_sr), model["params"].index.nlevels - 1), dtype=str) stat_ind = np.concatenate( @@ -1344,32 +1363,41 @@ def _extract_info_from_sm(model): return info -def _apply_number_format(df, number_format): +def _apply_number_format(df_raw, number_format, format_integers): """Apply string format to DataFrame cells. Args: - df (DataFrame): The DataFrame with float values to format. - number_format(str, list, tuple, callable or int): User defined number format + df_raw (DataFrame): The DataFrame with float values to format. + number_format (str, list, tuple, callable or int): User defined number format to apply to the DataFrame. + format_integers (bool): Apply number format also to integers Returns: df_formatted (DataFrame): Formatted DataFrame. 
""" processed_format = _process_number_format(number_format) + df_raw = df_raw.copy(deep=True) if isinstance(processed_format, (list, tuple)): - df_formatted = df.copy(deep=True).astype("float") + df_formatted = df_raw.copy(deep=True).astype("float") for formatter in processed_format[:-1]: df_formatted = df_formatted.applymap(formatter.format).astype("float") df_formatted = df_formatted.astype("float").applymap( processed_format[-1].format ) elif isinstance(processed_format, str): - df_formatted = df.astype("str").applymap( + df_formatted = df_raw.astype("str").applymap( partial(_format_non_scientific_numbers, format_string=processed_format) ) elif callable(processed_format): - df_formatted = df.applymap(processed_format) + df_formatted = df_raw.applymap(processed_format) + + # Don't format integers: set to original value + if not format_integers: + integer_locs = df_raw.applymap(_is_integer) + df_formatted[integer_locs] = ( + df_raw[integer_locs].astype(float).applymap("{:.0f}".format) + ) return df_formatted @@ -1425,49 +1453,16 @@ def _get_digits_after_decimal(df): return max_trail -def _center_align_integers(sr): - """Align integer numbers at the center of model column. - - Args: - sr (pandas.Series): Series. - - Returns: - pandas.Series: Series with numbers aligned at the center. - - """ - sr = sr.copy() - for i in sr.index: - res_numeric = re.findall( - r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", sr[i] - ) - if res_numeric: - num = res_numeric[0] - char = sr[i].split(num)[1] - if int(float(num)) == float(num): - sr[i] = f"\\multicolumn{{1}}{{c}}{{{str(int(float(num)))+char}}}" - return sr - - -def _unformat_integers(sr): - """Remove trailing zeros from integer numbers. - - Args: - sr (pandas.Series): Series. - - Returns: - pandas.Series: Series with trailing zeros removed. 
- - """ - sr = sr.copy() +def _center_align_integers_and_non_numeric_strings(sr): + """Align integer numbers and strings at the center of model column.""" + sr = deepcopy(sr) for i in sr.index: - res_numeric = re.findall( - r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", sr[i] - ) - if res_numeric: - num = res_numeric[0] - char = sr[i].split(num)[1] - if int(float(num)) == float(num): - sr[i] = str(int(float(num))) + char + if _is_integer(sr[i]): + sr[i] = f"\\multicolumn{{1}}{{c}}{{{str(int(float(sr[i])))}}}" + else: + string_without_stars = sr[i].split("$", 1)[0] + if not string_without_stars.replace(".", "").isnumeric(): + sr[i] = f"\\multicolumn{{1}}{{c}}{{{sr[i]}}}" return sr @@ -1485,3 +1480,12 @@ def _get_updated_styler( for ax in [0, 1]: styler = styler.format_index(escape=escape_special_characters, axis=ax) return styler + + +def _is_integer(num): + """Check if number is an integer (including a float with only zeros as digits)""" + try: + out = int(float(num)) == float(num) + except ValueError: + out = False + return out diff --git a/tests/visualization/test_estimation_table.py b/tests/visualization/test_estimation_table.py index 038abbdc9..eb28da4e6 100644 --- a/tests/visualization/test_estimation_table.py +++ b/tests/visualization/test_estimation_table.py @@ -23,6 +23,7 @@ estimation_table, render_html, render_latex, + _center_align_integers_and_non_numeric_strings, ) from pandas.testing import assert_frame_equal as afe from pandas.testing import assert_series_equal as ase @@ -269,7 +270,7 @@ def test_create_statistics_sr(): add_trailing_zeros, max_trail=4, ) - exp = pd.Series(["0.4500", "0.0002", "400.0000"]) + exp = pd.Series(["0.4500", "0.0002", "400"]) exp.index = pd.MultiIndex.from_arrays( np.array([np.array(["R2", "R2 Adj.", "Observations"]), np.array(["", "", ""])]) ) @@ -329,15 +330,19 @@ def test_apply_number_format_tuple(): number_format = ("{0:.2g}", "{0:.2f}", "{0:.2g}") raw = pd.DataFrame(data=[1234.2332, 0.0001]) exp = 
pd.DataFrame(data=["1.2e+03", "0"]) - res = _apply_number_format(df=raw, number_format=number_format) + res = _apply_number_format( + df_raw=raw, number_format=number_format, format_integers=False + ) afe(exp, res) def test_apply_number_format_int(): number_format = 3 raw = pd.DataFrame(data=["1234.2332", "1.2e+03"]) - exp = pd.DataFrame(data=["1234.233", "1.2e+03"]) - res = _apply_number_format(df=raw, number_format=number_format) + exp = pd.DataFrame(data=["1234.233", "1200"]) + res = _apply_number_format( + df_raw=raw, number_format=number_format, format_integers=False + ) afe(exp, res) @@ -349,7 +354,7 @@ def nsf(num, n=3): raw = pd.DataFrame(data=[1234.2332, 0.0001]) exp = pd.DataFrame(data=["1.23e+03", "1.00e-04"]) - res = _apply_number_format(df=raw, number_format=nsf) + res = _apply_number_format(df_raw=raw, number_format=nsf, format_integers=False) afe(exp, res) @@ -466,3 +471,26 @@ def test_check_order_of_model_names_raises_error(): model_names = ["a", "b", "a"] with pytest.raises(ValueError): _check_order_of_model_names(model_names) + + +def test_manual_extra_info(): + footer_str = """ + ,target + R$^2$,0.40 + Adj. R$^2$,0.40 + Residual Std. Error,60.5 + F Statistic,72.90$^{***}$ + Observations,442 + Controls,Yes + + """ + footer = _read_csv_string(footer_str).fillna("") + footer.set_index(" ", inplace=True) + footer.index.names = [None] + footer.index = pd.MultiIndex.from_arrays([footer.index]) + exp = footer.copy(deep=True) + exp.loc["Controls"] = "\\multicolumn{1}{c}{Yes}" + exp.loc["Observations"] = "\\multicolumn{1}{c}{442}" + for i, r in footer.iterrows(): + res = _center_align_integers_and_non_numeric_strings(r) + ase(exp.loc[i], res)