microsoft · thinkall · Nov 1, 2024 · Oct 24, 2024 · Oct 24, 2024 · Oct 24, 2024
diff --git a/flaml/automl/model.py b/flaml/automl/model.py
@@ -1567,13 +1567,6 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
                 # for xgboost>=1.6.0, pop callbacks to enable pickle
                 callbacks = self.params.pop("callbacks")
                 self._model.set_params(callbacks=callbacks[:-1])
-            best_iteration = (
-                getattr(self._model.get_booster(), "best_iteration", None)
-                if isinstance(self, XGBoostSklearnEstimator)
-                else self._model.best_iteration_
-            )
-            if best_iteration is not None:
-                self._model.set_params(n_estimators=best_iteration + 1)
         else:
             self._fit(X_train, y_train, **kwargs)
         train_time = time.time() - start_time
@@ -1588,8 +1581,6 @@ def _callback(self, start_time, deadline, free_mem_ratio, env) -> None:
         now = time.time()
         if env.iteration == 0:
             self._time_per_iter = now - start_time
-        if now + self._time_per_iter > deadline:
-            raise EarlyStopException(env.iteration, env.evaluation_result_list)
         if psutil is not None:
             mem = psutil.virtual_memory()
             if mem.available / mem.total < free_mem_ratio:

diff --git a/test/automl/test_classification.py b/test/automl/test_classification.py
@@ -492,7 +492,7 @@ def test_reproducibility_of_classification_models(estimator: str):
         "extra_tree",
         "histgb",
         "kneighbor",
-        # "lgbm",
+        "lgbm",
         # "lrl1",
         "lrl2",
         "rf",

diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py
@@ -338,14 +338,60 @@ def test_reproducibility_of_catboost_regression_model():
     assert pytest.approx(val_loss_flaml) == reproduced_val_loss
 
 
+def test_reproducibility_of_lgbm_regression_model():
+    """FLAML finds the best model for a given dataset, which it then provides to users.
+
+    However, there are reported issues with LightGBM models - see:
+    https://github.com/microsoft/FLAML/issues/1368
+    In this test we take the best LightGBM regression model that FLAML returned, then retrain and evaluate it on the
+    same folds to verify that the validation loss is reproducible.
+    """
+    automl = AutoML()
+    automl_settings = {
+        "time_budget": 3,
+        "task": "regression",
+        "n_jobs": 1,
+        "estimator_list": ["lgbm"],
+        "eval_method": "cv",
+        "n_splits": 9,
+        "metric": "r2",
+        "keep_search_state": True,
+        "skip_transform": True,
+        "retrain_full": True,
+    }
+    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
+    automl.fit(X_train=X, y_train=y, **automl_settings)
+    best_model = automl.model
+    assert best_model is not None
+    config = best_model.get_params()
+    val_loss_flaml = automl.best_result["val_loss"]
+
+    # Take the best model, and see if we can reproduce the best result
+    reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
+        config=config,
+        estimator=best_model,
+        X_train_all=automl._state.X_train_all,
+        y_train_all=automl._state.y_train_all,
+        budget=None,
+        kf=automl._state.kf,
+        eval_metric="r2",
+        best_val_loss=None,
+        cv_score_agg_func=None,
+        log_training_metric=False,
+        fit_kwargs=None,
+        free_mem_ratio=0,
+    )
+    assert pytest.approx(val_loss_flaml) == reproduced_val_loss
+
+
 @pytest.mark.parametrize(
     "estimator",
     [
         "catboost",
         "extra_tree",
         "histgb",
         "kneighbor",
-        # "lgbm",
+        "lgbm",
         "rf",
         "xgboost",
         "xgb_limitdepth",
@@ -374,6 +420,7 @@ def test_reproducibility_of_underlying_regression_models(estimator: str):
         "metric": "r2",
         "keep_search_state": True,
         "skip_transform": True,
+        "retrain_full": False,
     }
     X, y = fetch_california_housing(return_X_y=True, as_frame=True)
     automl.fit(X_train=X, y_train=y, **automl_settings)