Refactor tranquilo code (#395)

optimagic-dev · Nov 4, 2022 · e0b5b6c · e0b5b6c
1 parent 4b9019f
commit e0b5b6c
Show file tree

Hide file tree

Showing 10 changed files with 410 additions and 180 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
     -   id: debug-statements
     -   id: end-of-file-fixer
 -   repo: https://github.com/asottile/reorder_python_imports
-    rev: v3.8.5
+    rev: v3.9.0
     hooks:
     -   id: reorder-python-imports
         types: [python]

diff --git a/src/estimagic/optimization/tranquilo/aggregate_models.py b/src/estimagic/optimization/tranquilo/aggregate_models.py
@@ -9,18 +9,14 @@ def get_aggregator(aggregator, functype, model_info):
 
     Args:
         aggregator (str or callable): Name of an aggregator or aggregator function.
-            The function must take as arguments (in that order):
+            The function must take as argument:
             - vector_model (VectorModel): A fitted vector model.
-            - fvec_center (np.ndarray): A 1d array of the residuals at the center of the
-            trust-region. In the noisy case, this may be an average.
-            - model_info (ModelInfo): The model information.
         functype (str): One of "scalar", "least_squares" and "likelihood".
         model_info (ModelInfo): Information that describes the functional form of
             the model.
 
     Returns:
-        callable: The partialled aggregator that only depends on vector_model and
-            fvec_center.
+        callable: The partialled aggregator that only depends on vector_model.
 
     """
     built_in_aggregators = {
@@ -46,7 +42,7 @@ def get_aggregator(aggregator, functype, model_info):
 
     # determine if aggregator is compatible with functype and model_info
     aggregator_compatible_with_functype = {
-        "scalar": ("identity", "identity_old", "sum"),
+        "scalar": ("identity", "sum"),
         "least_squares": ("least_squares_linear",),
         "likelihood": (
             "sum",
@@ -83,45 +79,29 @@ def get_aggregator(aggregator, functype, model_info):
             )
 
     # create aggregator
-    out = partial(
-        _aggregate_models_template, aggregator=_aggregator, model_info=model_info
-    )
+    out = partial(_aggregate_models_template, aggregator=_aggregator)
     return out
 
 
-def _aggregate_models_template(vector_model, fvec_center, aggregator, model_info):
+def _aggregate_models_template(vector_model, aggregator):
     """Aggregate a VectorModel into a ScalarModel.
 
-    Note on fvec_center:
-    --------------------
-    Let x0 be the x-value at which the x-sample is centered. If there is little noise
-    and the criterion function f is evaluated at x0, then fvec_center = f(x0). If,
-    however, the criterion function is very noisy or only evaluated in a neighborhood
-    around x0, then fvec_center is constructed as an average over evaluations of f
-    with x close to x0.
-
     Args:
         vector_model (VectorModel): The VectorModel to aggregate.
-        fvec_center (np.ndarray): A 1d array of the residuals at the center of the
-            trust-region. In the noisy case, this may be an average.
         aggregator (callable): The function that does the actual aggregation.
-        model_info (ModelInfo): Information that describes the functional form of
-            the model.
 
     Returns:
         ScalarModel: The aggregated model
 
     """
-    intercept, linear_terms, square_terms = aggregator(
-        vector_model, fvec_center, model_info
-    )
+    intercept, linear_terms, square_terms = aggregator(vector_model)
     scalar_model = ScalarModel(
         intercept=intercept, linear_terms=linear_terms, square_terms=square_terms
     )
     return scalar_model
 
 
-def aggregator_identity(vector_model, fvec_center, model_info):
+def aggregator_identity(vector_model):
     """Aggregate quadratic VectorModel using identity function.
 
     This aggregation is useful if the underlying maximization problem is a scalar
@@ -135,14 +115,14 @@ def aggregator_identity(vector_model, fvec_center, model_info):
     """
     intercept = float(vector_model.intercepts)
     linear_terms = np.squeeze(vector_model.linear_terms)
-    if model_info.has_squares or model_info.has_interactions:
-        square_terms = np.squeeze(vector_model.square_terms)
-    else:
+    if vector_model.square_terms is None:
         square_terms = np.zeros((len(linear_terms), len(linear_terms)))
+    else:
+        square_terms = np.squeeze(vector_model.square_terms)
     return intercept, linear_terms, square_terms
 
 
-def aggregator_sum(vector_model, fvec_center, model_info):
+def aggregator_sum(vector_model):
     """Aggregate quadratic VectorModel using sum function.
 
     This aggregation is useful if the underlying maximization problem is a likelihood
@@ -163,7 +143,7 @@ def aggregator_sum(vector_model, fvec_center, model_info):
     return intercept, linear_terms, square_terms
 
 
-def aggregator_least_squares_linear(vector_model, fvec_center, model_info):
+def aggregator_least_squares_linear(vector_model):
     """Aggregate linear VectorModel assuming a least_squares functype.
 
     This aggregation is useful if the underlying maximization problem is a least-squares
@@ -190,7 +170,7 @@ def aggregator_least_squares_linear(vector_model, fvec_center, model_info):
     return intercept, linear_terms, square_terms
 
 
-def aggregator_information_equality_linear(vector_model, fvec_center, model_info):
+def aggregator_information_equality_linear(vector_model):
     """Aggregate linear VectorModel using the Fisher information equality.
 
     This aggregation is useful if the underlying maximization problem is a likelihood

diff --git a/src/estimagic/optimization/tranquilo/filter_points.py b/src/estimagic/optimization/tranquilo/filter_points.py
@@ -1,4 +1,5 @@
 import numpy as np
+from estimagic.optimization.tranquilo.models import n_second_order_terms
 from numba import njit
 from scipy.linalg import qr_multiply
 
@@ -62,7 +63,7 @@ def drop_collinear_pounders(xs, indices, state):
 def _drop_collinear_pounders(xs, indices, state):
     theta2 = 1e-4
     n_samples, n_params = xs.shape
-    n_poly_terms = n_params * (n_params + 1) // 2
+    n_poly_terms = n_second_order_terms(n_params)
 
     indices_reverse = indices[::-1]
     indexer_reverse = np.arange(n_samples)[::-1]
@@ -97,7 +98,9 @@ def _drop_collinear_pounders(xs, indices, state):
             continue
 
         linear_features[counter, 1:] = centered_xs[index]
-        square_features[counter, :] = _square_features(linear_features[counter, 1:])
+        square_features[counter, :] = _scaled_square_features(
+            linear_features[counter, 1:]
+        )
 
         linear_features_pad = np.zeros((n_samples, n_samples))
         linear_features_pad[:n_samples, : n_params + 1] = linear_features
@@ -127,15 +130,15 @@ def _get_polynomial_feature_matrices(
     square_features = np.zeros((n_samples, n_poly_terms))
 
     linear_features[0, 1:] = centered_xs[indexer[index_center]]
-    square_features[0, :] = _square_features(linear_features[0, 1:]).flatten()
+    square_features[0, :] = _scaled_square_features(linear_features[0, 1:]).flatten()
 
     _is_center_in_head = index_center < n_params
     idx_list_n = [i for i in range(n_params + _is_center_in_head) if i != index_center]
     idx_list_n_plus_1 = [index_center] + idx_list_n
 
     linear_features[:, 0] = 1
     linear_features[: n_params + 1, 1:] = centered_xs[indexer[idx_list_n_plus_1]]
-    square_features[: n_params + 1, :] = _square_features(
+    square_features[: n_params + 1, :] = _scaled_square_features(
         linear_features[: n_params + 1, 1:]
     )
 
@@ -145,16 +148,29 @@ def _get_polynomial_feature_matrices(
 
 
 @njit
-def _square_features(x):
+def _scaled_square_features(x):
+    """Construct scaled interaction and square terms.
+
+    The interaction terms are scaled by 1 / sqrt{2} while the square terms are scaled
+    by 1 / 2.
+
+    Args:
+        x (np.ndarray): Array of shape (n_samples, n_params).
+
+    Returns:
+        np.ndarray: Scaled interaction and square terms. Has shape (n_samples,
+            n_params + (n_params - 1) * n_params / 1).
+
+    """
     n_samples, n_params = np.atleast_2d(x).shape
-    n_poly_terms = n_params * (n_params + 1) // 2
+    n_poly_terms = n_second_order_terms(n_params)
 
-    poly_terms = np.empty((n_poly_terms, n_samples), x.dtype)
+    poly_terms = np.empty((n_poly_terms, n_samples), np.float64)
     xt = x.T
 
     idx = 0
     for i in range(n_params):
-        poly_terms[idx] = 0.5 * xt[i] ** 2
+        poly_terms[idx] = xt[i] ** 2 / 2
         idx += 1
 
         for j in range(i + 1, n_params):

diff --git a/src/estimagic/optimization/tranquilo/fit_models.py b/src/estimagic/optimization/tranquilo/fit_models.py
@@ -4,6 +4,8 @@
 
 import numpy as np
 from estimagic.optimization.tranquilo.models import ModelInfo
+from estimagic.optimization.tranquilo.models import n_interactions
+from estimagic.optimization.tranquilo.models import n_second_order_terms
 from estimagic.optimization.tranquilo.models import VectorModel
 from numba import njit
 from scipy.linalg import qr_multiply
@@ -369,16 +371,15 @@ def _get_current_fit_minimal_frobenius_norm_of_hessian(
 
     for k in range(n_residuals):
         z_y_vec = np.dot(z_mat.T, y[:, k])
-        coeffs_first_stage = np.linalg.solve(
-            np.atleast_2d(n_z_mat_square),
-            np.atleast_1d(z_y_vec),
+        coeffs_first_stage, *_ = np.linalg.lstsq(
+            np.atleast_2d(n_z_mat_square), np.atleast_1d(z_y_vec), rcond=None
         )
 
         coeffs_second_stage = np.atleast_2d(n_z_mat) @ coeffs_first_stage
 
         rhs = y[:, k] - n_mat @ coeffs_second_stage
 
-        alpha = np.linalg.solve(m_mat, rhs[: n_params + 1])
+        alpha, *_ = np.linalg.lstsq(m_mat, rhs[: n_params + 1], rcond=None)
         coeffs_linear[k, :] = alpha[offset : (n_params + 1)]
 
         coeffs_square[k] = coeffs_second_stage
@@ -442,11 +443,11 @@ def _polynomial_features(x, has_squares):
     n_samples, n_params = x.shape
 
     if has_squares:
-        n_poly_terms = n_params * (n_params + 1) // 2
+        n_poly_terms = n_second_order_terms(n_params)
     else:
-        n_poly_terms = n_params * (n_params - 1) // 2
+        n_poly_terms = n_interactions(n_params)
 
-    poly_terms = np.empty((n_poly_terms, n_samples), x.dtype)
+    poly_terms = np.empty((n_poly_terms, n_samples), np.float64)
     xt = x.T
 
     idx = 0

diff --git a/src/estimagic/optimization/tranquilo/models.py b/src/estimagic/optimization/tranquilo/models.py
@@ -2,6 +2,7 @@
 from typing import Union
 
 import numpy as np
+from numba import njit
 
 
 class VectorModel(NamedTuple):
@@ -53,3 +54,52 @@ def evaluate_model(scalar_model, centered_x):
         y += x.T @ scalar_model.square_terms @ x / 2
 
     return y
+
+
+def n_free_params(dim, info_or_name):
+    """Number of free parameters in a model specified by name or model_info."""
+    out = dim + 1
+    if isinstance(info_or_name, ModelInfo):
+        info = info_or_name
+        if info.has_squares:
+            out += dim
+        if info.has_interactions:
+            out += n_interactions(dim)
+    elif isinstance(info_or_name, str) and info_or_name in (
+        "linear",
+        "quadratic",
+        "diagonal",
+    ):
+        name = info_or_name
+        if name == "quadratic":
+            out += n_second_order_terms(dim)
+        elif name == "diagonal":
+            out += dim
+    else:
+        raise ValueError()
+    return out
+
+
+@njit
+def n_second_order_terms(dim):
+    """Number of free second order terms in a quadratic model."""
+    return dim * (dim + 1) // 2
+
+
+@njit
+def n_interactions(dim):
+    """Number of free interaction terms in a quadratic model."""
+    return dim * (dim - 1) // 2
+
+
+def is_second_order_model(model_or_info):
+    """Check if a model has any second order terms."""
+    if isinstance(model_or_info, ModelInfo):
+        model_info = model_or_info
+        out = model_info.has_interactions or model_info.has_squares
+    elif isinstance(model_or_info, (ScalarModel, VectorModel)):
+        model = model_or_info
+        out = model.square_terms is not None
+    else:
+        raise TypeError()
+    return out
diff --git a/src/estimagic/optimization/tranquilo/tranquilo.py b/src/estimagic/optimization/tranquilo/tranquilo.py
@@ -10,6 +10,7 @@
 from estimagic.optimization.tranquilo.filter_points import get_sample_filter
 from estimagic.optimization.tranquilo.fit_models import get_fitter
 from estimagic.optimization.tranquilo.models import ModelInfo
+from estimagic.optimization.tranquilo.models import n_free_params
 from estimagic.optimization.tranquilo.models import ScalarModel
 from estimagic.optimization.tranquilo.options import Bounds
 from estimagic.optimization.tranquilo.options import ConvOptions
@@ -220,7 +221,6 @@ def _tranquilo(
 
         scalar_model = aggregate_vector_model(
             vector_model=vector_model,
-            fvec_center=state.fvec,
         )
 
         sub_sol = solve_subproblem(model=scalar_model, trustregion=trustregion)
@@ -292,7 +292,6 @@ def _calculate_rho(actual_improvement, expected_improvement):
         rho = -np.inf
     else:
         rho = actual_improvement / expected_improvement
-
     return rho
 
 
@@ -349,10 +348,10 @@ def _process_surrogate_model(surrogate_model, functype):
     elif isinstance(surrogate_model, str):
         if surrogate_model == "linear":
             out = ModelInfo(has_squares=False, has_interactions=False)
-        elif surrogate_model == "quadratic":
-            out = ModelInfo(has_squares=True, has_interactions=True)
         elif surrogate_model == "diagonal":
             out = ModelInfo(has_squares=True, has_interactions=False)
+        elif surrogate_model == "quadratic":
+            out = ModelInfo(has_squares=True, has_interactions=True)
         else:
             raise ValueError(f"Invalid surrogate model: {surrogate_model}")
 
@@ -371,11 +370,11 @@ def _process_sample_size(user_sample_size, model_info, x):
     elif isinstance(user_sample_size, str):
         user_sample_size = user_sample_size.replace(" ", "")
         if user_sample_size in ["linear", "n+1"]:
-            out = len(x) + 1
+            out = n_free_params(dim=len(x), info_or_name="linear")
         elif user_sample_size in ["powell", "2n+1", "2*n+1"]:
             out = 2 * len(x) + 1
         elif user_sample_size == "quadratic":
-            out = len(x) * (len(x) + 1) // 2 + len(x) + 1
+            out = n_free_params(dim=len(x), info_or_name="quadratic")
         else:
             raise ValueError(f"Invalid sample size: {user_sample_size}")