From 79c70076c05cd57243726d77e07213b60c11ca02 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 25 Oct 2020 23:55:42 -0500 Subject: [PATCH 01/14] [ci] [python] reduce unnecessary data loading in tests --- tests/python_package_test/test_basic.py | 13 +++- tests/python_package_test/test_engine.py | 93 ++++++++++++++--------- tests/python_package_test/test_sklearn.py | 81 +++++++++++++------- 3 files changed, 118 insertions(+), 69 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index a3d6b8eced3a..d68651d3b236 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -3,6 +3,8 @@ import tempfile import unittest +from functools import lru_cache + import lightgbm as lgb import numpy as np @@ -11,10 +13,15 @@ from sklearn.model_selection import train_test_split +@lru_cache +def _load_breast_cancer(**kwargs): + return load_breast_cancer(**kwargs) + + class TestBasic(unittest.TestCase): def test(self): - X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), + X_train, X_test, y_train, y_test = train_test_split(*_load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) train_data = lgb.Dataset(X_train, label=y_train) valid_data = train_data.create_valid(X_test, label=y_test) @@ -86,7 +93,7 @@ def test(self): os.remove(tname) def test_chunked_dataset(self): - X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) + X_train, X_test, y_train, y_test = train_test_split(*_load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) chunk_size = X_train.shape[0] // 10 + 1 X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)] @@ -273,7 +280,7 @@ def check_asserts(data): self.assertAlmostEqual(data.label[1], data.weight[1]) self.assertListEqual(data.feature_name, data.get_feature_name()) - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) sequence = np.ones(y.shape[0]) sequence[0] = np.nan sequence[1] = np.inf diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 0c90a6bada87..fa069e830d69 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -7,6 +7,8 @@ import random import unittest +from functools import lru_cache + import lightgbm as lgb import numpy as np from scipy.sparse import csr_matrix, isspmatrix_csr, isspmatrix_csc @@ -51,9 +53,26 @@ def categorize(continuous_x): return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) +@lru_cache +def _load_boston(**kwargs): + return load_boston(**kwargs) + +@lru_cache +def _load_breast_cancer(**kwargs): + return load_breast_cancer(**kwargs) + +@lru_cache +def _load_digits(**kwargs): + return load_digits(**kwargs) + +@lru_cache +def _load_iris(**kwargs): + return load_iris(**kwargs) + + class TestEngine(unittest.TestCase): def test_binary(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -75,7 +94,7 @@ def test_binary(self): self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) def test_rf(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'boosting_type': 'rf', @@ -100,7 +119,7 @@ def test_rf(self): self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) def test_regression(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'metric': 'l2', @@ -377,7 +396,7 @@ def test_categorical_non_zero_inputs(self): self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5) def test_multiclass(self): - X, y = load_digits(n_class=10, return_X_y=True) + X, y = _load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'multiclass', @@ -398,7 +417,7 @@ def test_multiclass(self): self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5) def test_multiclass_rf(self): - X, y = load_digits(n_class=10, return_X_y=True) + X, y = _load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'boosting_type': 'rf', @@ -426,7 +445,7 @@ def test_multiclass_rf(self): self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5) def test_multiclass_prediction_early_stopping(self): - X, y = load_digits(n_class=10, return_X_y=True) + X, y = _load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'multiclass', @@ -452,7 +471,7 @@ def test_multiclass_prediction_early_stopping(self): self.assertLess(ret, 0.2) def test_multi_class_error(self): - X, y = load_digits(n_class=10, return_X_y=True) + X, y = _load_digits(n_class=10, return_X_y=True) params = {'objective': 'multiclass', 'num_classes': 10, 'metric': 'multi_error', 'num_leaves': 4, 'verbose': -1} lgb_data = lgb.Dataset(X, label=y) @@ -497,7 +516,7 @@ def test_multi_class_error(self): def test_auc_mu(self): # should give same result as binary auc for 2 classes - X, y = load_digits(n_class=10, return_X_y=True) + X, y = _load_digits(n_class=10, return_X_y=True) y_new = np.zeros((len(y))) y_new[y != 0] = 1 lgb_X = lgb.Dataset(X, label=y_new) @@ -575,7 +594,7 @@ def test_auc_mu(self): self.assertNotEqual(results_weight['training']['auc_mu'][-1], results_no_weight['training']['auc_mu'][-1]) def test_early_stopping(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) params = { 'objective': 'binary', 'metric': 'binary_logloss', @@ -607,7 +626,7 @@ def test_early_stopping(self): self.assertIn('binary_logloss', gbm.best_score[valid_set_name]) def test_continue_train(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'regression', @@ -635,7 +654,7 @@ def test_continue_train(self): os.remove(model_name) def test_continue_train_reused_dataset(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) params = { 'objective': 'regression', 'verbose': -1 @@ -648,7 +667,7 @@ def test_continue_train_reused_dataset(self): self.assertEqual(gbm.current_iteration(), 20) def test_continue_train_dart(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'boosting_type': 'dart', @@ -671,7 +690,7 @@ def test_continue_train_dart(self): self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5) def test_continue_train_multiclass(self): - X, y = load_iris(return_X_y=True) + X, y = _load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'multiclass', @@ -694,7 +713,7 @@ def test_continue_train_multiclass(self): self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5) def test_cv(self): - X_train, y_train = load_boston(return_X_y=True) + X_train, y_train = _load_boston(return_X_y=True) params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) # shuffle = False, override metric in params @@ -753,7 +772,7 @@ def test_cv(self): np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean']) def test_cvbooster(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -797,7 +816,7 @@ def test_cvbooster(self): self.assertLess(ret, 0.15) def test_feature_name(self): - X_train, y_train = load_boston(return_X_y=True) + X_train, y_train = _load_boston(return_X_y=True) params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) feature_names = ['f_' + str(i) for i in range(X_train.shape[-1])] @@ -825,7 +844,7 @@ def test_feature_name_with_non_ascii(self): def test_save_load_copy_pickle(self): def train_and_predict(init_model=None, return_model=False): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'regression', @@ -989,7 +1008,7 @@ def test_reference_chain(self): self.assertEqual(len(evals_result['valid_1']['rmse']), 20) def test_contribs(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -1372,7 +1391,7 @@ def test_small_max_bin(self): np.random.seed() # reset seed def test_refit(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -1388,7 +1407,7 @@ def test_refit(self): self.assertGreater(err_pred, new_err_pred) def test_mape_rf(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) params = { 'boosting_type': 'rf', 'objective': 'mape', @@ -1405,7 +1424,7 @@ def test_mape_rf(self): self.assertGreater(pred_mean, 20) def test_mape_dart(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) params = { 'boosting_type': 'dart', 'objective': 'mape', @@ -1484,7 +1503,7 @@ def preprocess_data(dtrain, dtest, params): params['num_class'] = 4 return dtrain, dtest, params - X, y = load_iris(return_X_y=True) + X, y = _load_iris(return_X_y=True) dataset = lgb.Dataset(X, y, free_raw_data=False) params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) @@ -1492,7 +1511,7 @@ def preprocess_data(dtrain, dtest, params): self.assertEqual(len(results['multi_logloss-mean']), 10) def test_metrics(self): - X, y = load_digits(n_class=2, return_X_y=True) + X, y = _load_digits(n_class=2, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train, silent=True) lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train, silent=True) @@ -1798,7 +1817,7 @@ def train_booster(params=params_obj_verbose, **kwargs): self.assertEqual(len(evals_result), 1) self.assertIn('error', evals_result['valid_0']) - X, y = load_digits(n_class=3, return_X_y=True) + X, y = _load_digits(n_class=3, return_X_y=True) lgb_train = lgb.Dataset(X, y, silent=True) obj_multi_aliases = ['multiclass', 'softmax', 'multiclassova', 'multiclass_ova', 'ova', 'ovr'] @@ -1866,7 +1885,7 @@ def train_booster(params=params_obj_verbose, **kwargs): params_class_3_verbose, metrics='binary_error', fobj=dummy_obj) def test_multiple_feval_train(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) params = {'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} @@ -1889,7 +1908,7 @@ def test_multiple_feval_train(self): self.assertIn('decreasing_metric', evals_result['valid_0']) def test_multiple_feval_cv(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) params = {'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} @@ -1912,7 +1931,7 @@ def test_multiple_feval_cv(self): @unittest.skipIf(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, 'not enough RAM') def test_model_size(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) data = lgb.Dataset(X, y) bst = lgb.train({'verbose': -1}, data, num_boost_round=2) y_pred = bst.predict(X) @@ -1938,7 +1957,7 @@ def test_model_size(self): self.skipTest('not enough RAM') def test_get_split_value_histogram(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) lgb_train = lgb.Dataset(X, y, categorical_feature=[2]) gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20) # test XGBoost-style return value @@ -2048,7 +2067,7 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, eval_train_metric=eval_train_metric) self.assertEqual(assumed_iteration, len(ret[list(ret.keys())[0]])) - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=73) lgb_train = lgb.Dataset(X_train, y_train) @@ -2126,7 +2145,7 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, decreasing_metric(preds, train_data)]) def test_node_level_subcol(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -2307,7 +2326,7 @@ def test_dataset_params_with_reference(self): def test_extra_trees(self): # check extra trees increases regularization - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) lgb_x = lgb.Dataset(X, label=y) params = {'objective': 'regression', 'num_leaves': 32, @@ -2325,7 +2344,7 @@ def test_extra_trees(self): def test_path_smoothing(self): # check path smoothing increases regularization - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) lgb_x = lgb.Dataset(X, label=y) params = {'objective': 'regression', 'num_leaves': 32, @@ -2347,7 +2366,7 @@ def _imptcs_to_numpy(X, impcts_dict): cols = ['Column_' + str(i) for i in range(X.shape[1])] return [impcts_dict.get(col, 0.) for col in cols] - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) data = lgb.Dataset(X, label=y) num_trees = 10 bst = lgb.train({"objective": "binary", "verbose": -1}, data, num_trees) @@ -2392,7 +2411,7 @@ def _imptcs_to_numpy(X, impcts_dict): self.assertIsNone(tree_df.loc[0, col]) def test_interaction_constraints(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) num_features = X.shape[1] train_data = lgb.Dataset(X, label=y) # check that constraint containing all features is equivalent to no constraint @@ -2469,7 +2488,7 @@ def inner_test(X, y, params, early_stopping_rounds): np.testing.assert_allclose(pred4, pred6) # test for regression - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) params = { 'objective': 'regression', 'verbose': -1, @@ -2482,7 +2501,7 @@ def inner_test(X, y, params, early_stopping_rounds): inner_test(X, y, params, early_stopping_rounds=None) # test for multi-class - X, y = load_iris(return_X_y=True) + X, y = _load_iris(return_X_y=True) params = { 'objective': 'multiclass', 'metric': 'multi_logloss', @@ -2496,7 +2515,7 @@ def inner_test(X, y, params, early_stopping_rounds): inner_test(X, y, params, early_stopping_rounds=None) # test for binary - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) params = { 'objective': 'binary', 'metric': 'binary_logloss', diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index cb2f42c21ec6..eabf8b1e3f22 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -6,6 +6,8 @@ import unittest import warnings +from functools import lru_cache + import lightgbm as lgb import numpy as np from sklearn import __version__ as sk_version @@ -74,10 +76,31 @@ def multi_logloss(y_true, y_pred): return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) +@lru_cache +def _load_boston(**kwargs): + return load_boston(**kwargs) + +@lru_cache +def _load_breast_cancer(**kwargs): + return load_breast_cancer(**kwargs) + +@lru_cache +def _load_digits(**kwargs): + return load_digits(**kwargs) + +@lru_cache +def _load_iris(**kwargs): + return load_iris(**kwargs) + +@lru_cache +def _load_linnerud(**kwargs): + return load_linnerud(**kwargs) + + class TestSklearn(unittest.TestCase): def test_binary(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -86,7 +109,7 @@ def test_binary(self): self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5) def test_regression(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=50, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -95,7 +118,7 @@ def test_regression(self): self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5) def test_multiclass(self): - X, y = load_digits(n_class=10, return_X_y=True) + X, y = _load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -138,7 +161,7 @@ def test_xendcg(self): self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6253) def test_regression_with_custom_objective(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -147,7 +170,7 @@ def test_regression_with_custom_objective(self): self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5) def test_binary_classification_with_custom_objective(self): - X, y = load_digits(n_class=2, return_X_y=True) + X, y = _load_digits(n_class=2, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -159,7 +182,7 @@ def test_binary_classification_with_custom_objective(self): self.assertLess(ret, 0.05) def test_dart(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(boosting_type='dart', n_estimators=50) gbm.fit(X_train, y_train) @@ -172,7 +195,7 @@ def test_dart(self): def test_stacking_classifier(self): from sklearn.ensemble import StackingClassifier - X, y = load_iris(return_X_y=True) + X, y = _load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) classifiers = [('gbm1', lgb.LGBMClassifier(n_estimators=3)), ('gbm2', lgb.LGBMClassifier(n_estimators=3))] @@ -199,7 +222,7 @@ def test_stacking_classifier(self): def test_stacking_regressor(self): from sklearn.ensemble import StackingRegressor - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) regressors = [('gbm1', lgb.LGBMRegressor(n_estimators=3)), ('gbm2', lgb.LGBMRegressor(n_estimators=3))] @@ -218,7 +241,7 @@ def test_stacking_regressor(self): self.assertEqual(len(reg.final_estimator_.feature_importances_), 15) def test_grid_search(self): - X, y = load_iris(return_X_y=True) + X, y = _load_iris(return_X_y=True) y = y.astype(str) # utilize label encoder at it's max power X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -248,7 +271,7 @@ def test_grid_search(self): self.assertLessEqual(score, 1.) def test_random_search(self): - X, y = load_iris(return_X_y=True) + X, y = _load_iris(return_X_y=True) y = y.astype(str) # utilize label encoder at it's max power X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -302,7 +325,7 @@ def test_multioutput_classifier(self): # sklearn < 0.23 does not have as_frame parameter @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') def test_multioutput_regressor(self): - bunch = load_linnerud(as_frame=True) # returns a Bunch instance + bunch = _load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -341,7 +364,7 @@ def test_classifier_chain(self): # sklearn < 0.23 does not have as_frame parameter @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') def test_regressor_chain(self): - bunch = load_linnerud(as_frame=True) # returns a Bunch instance + bunch = _load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) order = [2, 0, 1] @@ -358,7 +381,7 @@ def test_regressor_chain(self): self.assertIsInstance(regressor.booster_, lgb.Booster) def test_clone_and_property(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) gbm = lgb.LGBMRegressor(n_estimators=10, silent=True) gbm.fit(X, y, verbose=False) @@ -366,7 +389,7 @@ def test_clone_and_property(self): self.assertIsInstance(gbm.booster_, lgb.Booster) self.assertIsInstance(gbm.feature_importances_, np.ndarray) - X, y = load_digits(n_class=2, return_X_y=True) + X, y = _load_digits(n_class=2, return_X_y=True) clf = lgb.LGBMClassifier(n_estimators=10, silent=True) clf.fit(X, y, verbose=False) self.assertListEqual(sorted(clf.classes_), [0, 1]) @@ -375,7 +398,7 @@ def test_clone_and_property(self): self.assertIsInstance(clf.feature_importances_, np.ndarray) def test_joblib(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj, silent=True, importance_type='split') @@ -400,7 +423,7 @@ def test_joblib(self): np.testing.assert_allclose(pred_origin, pred_pickle) def test_random_state_object(self): - X, y = load_iris(return_X_y=True) + X, y = _load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) state1 = np.random.RandomState(123) state2 = np.random.RandomState(123) @@ -433,14 +456,14 @@ def test_random_state_object(self): df1, df3) def test_feature_importances_single_leaf(self): - data = load_iris(return_X_y=False) + data = _load_iris(return_X_y=False) clf = lgb.LGBMClassifier(n_estimators=10) clf.fit(data.data, data.target) importances = clf.feature_importances_ self.assertEqual(len(importances), 4) def test_feature_importances_type(self): - data = load_iris(return_X_y=False) + data = _load_iris(return_X_y=False) clf = lgb.LGBMClassifier(n_estimators=10) clf.fit(data.data, data.target) clf.set_params(importance_type='split') @@ -564,7 +587,7 @@ def test_pandas_sparse(self): def test_predict(self): # With default params - iris = load_iris(return_X_y=False) + iris = _load_iris(return_X_y=False) X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42) @@ -644,7 +667,7 @@ def test_predict(self): res_engine, res_sklearn_params) def test_evaluate_train_set(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=10, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False) @@ -657,7 +680,7 @@ def test_evaluate_train_set(self): self.assertIn('l2', gbm.evals_result_['valid_1']) def test_metrics(self): - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) params = {'n_estimators': 2, 'verbose': -1} params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} @@ -697,7 +720,7 @@ def test_metrics(self): self.assertIn('mape', gbm.evals_result_['training']) # non-default metric with multiple metrics in eval_metric for LGBMClassifier - X_classification, y_classification = load_breast_cancer(return_X_y=True) + X_classification, y_classification = _load_breast_cancer(return_X_y=True) params_classification = {'n_estimators': 2, 'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} params_fit_classification = {'X': X_classification, 'y': y_classification, @@ -880,7 +903,7 @@ def test_metrics(self): self.assertIn('mape', gbm.evals_result_['training']) self.assertIn('error', gbm.evals_result_['training']) - X, y = load_digits(n_class=3, return_X_y=True) + X, y = _load_digits(n_class=3, return_X_y=True) params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} # default metric and invalid binary metric is replaced with multiclass alternative @@ -907,7 +930,7 @@ def test_metrics(self): self.assertIn('multi_logloss', gbm.evals_result_['training']) self.assertIn('multi_error', gbm.evals_result_['training']) - X, y = load_digits(n_class=2, return_X_y=True) + X, y = _load_digits(n_class=2, return_X_y=True) params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} # default metric and invalid multiclass metric is replaced with binary alternative @@ -924,7 +947,7 @@ def test_metrics(self): def test_multiple_eval_metrics(self): - X, y = load_breast_cancer(return_X_y=True) + X, y = _load_breast_cancer(return_X_y=True) params = {'n_estimators': 2, 'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} @@ -1003,7 +1026,7 @@ def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_ self.assertEqual(assumed_iteration if eval_set_name != 'training' else gbm.n_estimators, gbm.best_iteration_) - X, y = load_boston(return_X_y=True) + X, y = _load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72) params = {'n_estimators': 30, @@ -1084,7 +1107,7 @@ def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_ fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l2, True) def test_class_weight(self): - X, y = load_digits(n_class=10, return_X_y=True) + X, y = _load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) y_train_str = y_train.astype('str') y_test_str = y_test.astype('str') @@ -1118,7 +1141,7 @@ def test_class_weight(self): gbm_str.evals_result_[eval_set][metric]) def test_continue_training_with_model(self): - X, y = load_digits(n_class=3, return_X_y=True) + X, y = _load_digits(n_class=3, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) init_gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False) @@ -1133,7 +1156,7 @@ def test_continue_training_with_model(self): # sklearn < 0.22 requires passing "attributes" argument @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_check_is_fitted(self): - X, y = load_digits(n_class=2, return_X_y=True) + X, y = _load_digits(n_class=2, return_X_y=True) est = lgb.LGBMModel(n_estimators=5, objective="binary") clf = lgb.LGBMClassifier(n_estimators=5) reg = lgb.LGBMRegressor(n_estimators=5) From f792015960b1b7b30a48df14d849179f644ac916 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 26 Oct 2020 00:06:41 -0500 Subject: [PATCH 02/14] add profiling files to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 6e4ffb35670a..f2f2c6c2316f 100644 --- a/.gitignore +++ b/.gitignore @@ -318,6 +318,8 @@ htmlcov/ .coverage.* .cache nosetests.xml +prof/ +*.prof coverage.xml *,cover .hypothesis/ From c71a30f7481e05a1c25c6d90872914beef66edee Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 26 Oct 2020 12:29:36 -0500 Subject: [PATCH 03/14] just use cache() --- tests/python_package_test/test_basic.py | 4 ++-- tests/python_package_test/test_engine.py | 10 +++++----- tests/python_package_test/test_sklearn.py | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index d68651d3b236..633934268f90 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -3,7 +3,7 @@ import tempfile import unittest -from functools import lru_cache +from functools import cache import lightgbm as lgb import numpy as np @@ -13,7 +13,7 @@ from sklearn.model_selection import train_test_split -@lru_cache +@cache def _load_breast_cancer(**kwargs): return load_breast_cancer(**kwargs) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index fa069e830d69..096246a753bc 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -7,7 +7,7 @@ import random import unittest -from functools import lru_cache +from functools import cache import lightgbm as lgb import numpy as np @@ -53,19 +53,19 @@ def categorize(continuous_x): return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) -@lru_cache +@cache def _load_boston(**kwargs): return load_boston(**kwargs) -@lru_cache +@cache def _load_breast_cancer(**kwargs): return load_breast_cancer(**kwargs) -@lru_cache +@cache def _load_digits(**kwargs): return load_digits(**kwargs) -@lru_cache +@cache def _load_iris(**kwargs): return load_iris(**kwargs) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index eabf8b1e3f22..51ff70cfbf50 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -6,7 +6,7 @@ import unittest import warnings -from functools import lru_cache +from functools import cache import lightgbm as lgb import numpy as np @@ -76,23 +76,23 @@ def multi_logloss(y_true, y_pred): return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) -@lru_cache +@cache def _load_boston(**kwargs): return load_boston(**kwargs) -@lru_cache +@cache def _load_breast_cancer(**kwargs): return load_breast_cancer(**kwargs) -@lru_cache +@cache def _load_digits(**kwargs): return load_digits(**kwargs) -@lru_cache +@cache def _load_iris(**kwargs): return load_iris(**kwargs) -@lru_cache +@cache def _load_linnerud(**kwargs): return load_linnerud(**kwargs) From 979b76a0078472dccdfbe15dc4e73078e798ffa2 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 26 Oct 2020 12:38:18 -0500 Subject: [PATCH 04/14] default on cache size --- tests/python_package_test/test_basic.py | 4 ++-- tests/python_package_test/test_engine.py | 10 +++++----- tests/python_package_test/test_sklearn.py | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 633934268f90..62fd704a8de0 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -3,7 +3,7 @@ import tempfile import unittest -from functools import cache +from functools import lru_cache import lightgbm as lgb import numpy as np @@ -13,7 +13,7 @@ from sklearn.model_selection import train_test_split -@cache +@lru_cache(maxsize=None) def _load_breast_cancer(**kwargs): return load_breast_cancer(**kwargs) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 096246a753bc..1669fd2bfb3d 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -7,7 +7,7 @@ import random import unittest -from functools import cache +from functools import lru_cache import lightgbm as lgb import numpy as np @@ -53,19 +53,19 @@ def categorize(continuous_x): return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) -@cache +@lru_cache(maxsize=None) def _load_boston(**kwargs): return load_boston(**kwargs) -@cache +@lru_cache(maxsize=None) def _load_breast_cancer(**kwargs): return load_breast_cancer(**kwargs) -@cache +@lru_cache(maxsize=None) def _load_digits(**kwargs): return load_digits(**kwargs) -@cache +@lru_cache(maxsize=None) def _load_iris(**kwargs): return load_iris(**kwargs) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 51ff70cfbf50..68c65253c02d 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -6,7 +6,7 @@ import unittest import warnings -from functools import cache +from functools import lru_cache import lightgbm as lgb import numpy as np @@ -76,23 +76,23 @@ def multi_logloss(y_true, y_pred): return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) -@cache +@lru_cache(maxsize=None) def _load_boston(**kwargs): return load_boston(**kwargs) -@cache +@lru_cache(maxsize=None) def _load_breast_cancer(**kwargs): return load_breast_cancer(**kwargs) -@cache +@lru_cache(maxsize=None) def _load_digits(**kwargs): return load_digits(**kwargs) -@cache +@lru_cache(maxsize=None) def _load_iris(**kwargs): return load_iris(**kwargs) -@cache +@lru_cache(maxsize=None) def _load_linnerud(**kwargs): return load_linnerud(**kwargs) From a86a6c22287977d9654f9cd38a028d746f996a91 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 26 Oct 2020 16:32:15 -0500 Subject: [PATCH 05/14] patch lru_cache on Python 2.7 --- tests/python_package_test/test_basic.py | 15 +++++++++++++-- tests/python_package_test/test_engine.py | 15 +++++++++++++-- tests/python_package_test/test_sklearn.py | 15 +++++++++++++-- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 62fd704a8de0..6dd2658b2c30 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -3,8 +3,6 @@ import tempfile import unittest -from functools import lru_cache - import lightgbm as lgb import numpy as np @@ -12,6 +10,19 @@ from sklearn.datasets import load_breast_cancer, dump_svmlight_file, load_svmlight_file from sklearn.model_selection import train_test_split +try: + from functools import lru_cache +except ImportError: + warnings.warn("Could not import functools.lru_cache", RuntimeWarning) + def lru_cache(user_function, maxsize=None): + @wraps(user_function) + def wrapper(*args, **kwargs): + arg_key = tuple(args, [item for item in kwargs.items()]) + if arg_key not in cache: + cache[arg_key] = user_function(*args) + return cache[arg_key] + return wrapper + @lru_cache(maxsize=None) def _load_breast_cancer(**kwargs): diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 1669fd2bfb3d..d76b9ac0a58c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -7,8 +7,6 @@ import random import unittest -from functools import lru_cache - import lightgbm as lgb import numpy as np from scipy.sparse import csr_matrix, isspmatrix_csr, isspmatrix_csc @@ -22,6 +20,19 @@ except ImportError: import pickle +try: + from functools import lru_cache +except ImportError: + warnings.warn("Could not import functools.lru_cache", RuntimeWarning) + def lru_cache(user_function, maxsize=None): + @wraps(user_function) + def wrapper(*args, **kwargs): + arg_key = tuple(args, [item for item in kwargs.items()]) + if arg_key not in cache: + cache[arg_key] = user_function(*args) + return cache[arg_key] + return wrapper + decreasing_generator = itertools.count(0, -1) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 68c65253c02d..88ae92002274 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -6,8 +6,6 @@ import unittest import warnings -from functools import lru_cache - import lightgbm as lgb import numpy as np from sklearn import __version__ as sk_version @@ -24,6 +22,19 @@ check_parameters_default_constructible) from sklearn.utils.validation import check_is_fitted +try: + from functools import lru_cache +except ImportError: + warnings.warn("Could not import functools.lru_cache", RuntimeWarning) + def lru_cache(user_function, maxsize=None): + @wraps(user_function) + def wrapper(*args, **kwargs): + arg_key = tuple(args, [item for item in kwargs.items()]) + if arg_key not in cache: + cache[arg_key] = user_function(*args) + return cache[arg_key] + return wrapper + decreasing_generator = itertools.count(0, -1) From afff7d3638c94c9cc7ecfa2bd9db292142191f99 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 26 Oct 2020 16:37:07 -0500 Subject: [PATCH 06/14] linting --- tests/python_package_test/test_basic.py | 1 + tests/python_package_test/test_engine.py | 4 ++++ tests/python_package_test/test_sklearn.py | 5 +++++ 3 files changed, 10 insertions(+) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 6dd2658b2c30..24ac8ccded1a 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -14,6 +14,7 @@ from functools import lru_cache except ImportError: warnings.warn("Could not import functools.lru_cache", RuntimeWarning) + def lru_cache(user_function, maxsize=None): @wraps(user_function) def wrapper(*args, **kwargs): diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index d76b9ac0a58c..42241292d43a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -24,6 +24,7 @@ from functools import lru_cache except ImportError: warnings.warn("Could not import functools.lru_cache", RuntimeWarning) + def lru_cache(user_function, maxsize=None): @wraps(user_function) def wrapper(*args, **kwargs): @@ -68,14 +69,17 @@ def categorize(continuous_x): def _load_boston(**kwargs): return load_boston(**kwargs) + @lru_cache(maxsize=None) def _load_breast_cancer(**kwargs): return load_breast_cancer(**kwargs) + @lru_cache(maxsize=None) def _load_digits(**kwargs): return load_digits(**kwargs) + @lru_cache(maxsize=None) def _load_iris(**kwargs): return load_iris(**kwargs) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 88ae92002274..d9a3918950c7 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -26,6 +26,7 @@ from functools import lru_cache except ImportError: warnings.warn("Could not import functools.lru_cache", RuntimeWarning) + def lru_cache(user_function, maxsize=None): @wraps(user_function) def wrapper(*args, **kwargs): @@ -91,18 +92,22 @@ def multi_logloss(y_true, y_pred): def _load_boston(**kwargs): return load_boston(**kwargs) + @lru_cache(maxsize=None) def _load_breast_cancer(**kwargs): return load_breast_cancer(**kwargs) + @lru_cache(maxsize=None) def _load_digits(**kwargs): return load_digits(**kwargs) + @lru_cache(maxsize=None) def _load_iris(**kwargs): return load_iris(**kwargs) + @lru_cache(maxsize=None) def _load_linnerud(**kwargs): return load_linnerud(**kwargs) From 25a9bc7736010eb4278583c576dc761599c0040d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 26 Oct 2020 22:38:42 -0500 Subject: [PATCH 07/14] reduce duplicated code --- tests/python_package_test/__init__.py | 0 tests/python_package_test/test_basic.py | 14 +------------- tests/python_package_test/test_engine.py | 14 +------------- tests/python_package_test/test_sklearn.py | 14 +------------- tests/python_package_test/utils.py | 14 ++++++++++++++ 5 files changed, 17 insertions(+), 39 deletions(-) create mode 100644 tests/python_package_test/__init__.py create mode 100644 tests/python_package_test/utils.py diff --git a/tests/python_package_test/__init__.py b/tests/python_package_test/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 1c0accd8a356..19a4debbb914 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -10,19 +10,7 @@ from sklearn.datasets import load_breast_cancer, dump_svmlight_file, load_svmlight_file from sklearn.model_selection import train_test_split -try: - from functools import lru_cache -except ImportError: - warnings.warn("Could not import functools.lru_cache", RuntimeWarning) - - def lru_cache(user_function, maxsize=None): - @wraps(user_function) - def wrapper(*args, **kwargs): - arg_key = tuple(args, [item for item in kwargs.items()]) - if arg_key not in cache: - cache[arg_key] = user_function(*args) - return cache[arg_key] - return wrapper +from .utils import lru_cache @lru_cache(maxsize=None) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 42241292d43a..67696174a711 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -20,19 +20,7 @@ except ImportError: import pickle -try: - from functools import lru_cache -except ImportError: - warnings.warn("Could not import functools.lru_cache", RuntimeWarning) - - def lru_cache(user_function, maxsize=None): - @wraps(user_function) - def wrapper(*args, **kwargs): - arg_key = tuple(args, [item for item in kwargs.items()]) - if arg_key not in cache: - cache[arg_key] = user_function(*args) - return cache[arg_key] - return wrapper +from .utils import lru_cache decreasing_generator = itertools.count(0, -1) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index d9a3918950c7..498009620bf3 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -22,19 +22,7 @@ check_parameters_default_constructible) from sklearn.utils.validation import check_is_fitted -try: - from functools import lru_cache -except ImportError: - warnings.warn("Could not import functools.lru_cache", RuntimeWarning) - - def lru_cache(user_function, maxsize=None): - @wraps(user_function) - def wrapper(*args, **kwargs): - arg_key = tuple(args, [item for item in kwargs.items()]) - if arg_key not in cache: - cache[arg_key] = user_function(*args) - return cache[arg_key] - return wrapper +from .utils import lru_cache decreasing_generator = itertools.count(0, -1) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py new file mode 100644 index 000000000000..428b630e41e3 --- /dev/null +++ b/tests/python_package_test/utils.py @@ -0,0 +1,14 @@ + +try: + from functools import lru_cache +except ImportError: + warnings.warn("Could not import functools.lru_cache", RuntimeWarning) + + def lru_cache(user_function, maxsize=None): + @wraps(user_function) + def wrapper(*args, **kwargs): + arg_key = tuple(args, [item for item in kwargs.items()]) + if arg_key not in cache: + cache[arg_key] = user_function(*args) + return cache[arg_key] + return wrapper From 3af0a5c1941f9802334348a86393b0b4b235bccc Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 26 Oct 2020 22:55:33 -0500 Subject: [PATCH 08/14] missing warnings --- tests/python_package_test/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 428b630e41e3..cb8e02abc7ff 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -2,6 +2,7 @@ try: from functools import lru_cache except ImportError: + import warnings warnings.warn("Could not import functools.lru_cache", RuntimeWarning) def lru_cache(user_function, maxsize=None): From 3394222408a2ca004c3c4294aa7f9f844bf72c37 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 27 Oct 2020 23:09:13 -0500 Subject: [PATCH 09/14] fix imports --- tests/python_package_test/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index cb8e02abc7ff..a2e0ba629e36 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -1,8 +1,8 @@ - try: from functools import lru_cache except ImportError: import warnings + from functools import wraps warnings.warn("Could not import functools.lru_cache", RuntimeWarning) def lru_cache(user_function, maxsize=None): From 3a5fe6094083d4603586ebe579ba5d0342106917 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 28 Oct 2020 00:56:33 -0500 Subject: [PATCH 10/14] fix lru_cache backport --- tests/python_package_test/utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index a2e0ba629e36..57f97b8b3ce7 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -2,14 +2,16 @@ from functools import lru_cache except ImportError: import warnings - from functools import wraps warnings.warn("Could not import functools.lru_cache", RuntimeWarning) - def lru_cache(user_function, maxsize=None): - @wraps(user_function) - def wrapper(*args, **kwargs): - arg_key = tuple(args, [item for item in kwargs.items()]) - if arg_key not in cache: - cache[arg_key] = user_function(*args) - return cache[arg_key] - return wrapper + def lru_cache(maxsize=None): + cache = {} + + def _lru_wrapper(user_function): + def wrapper(*args, **kwargs): + arg_key = (args, tuple([item for item in kwargs.items()])) + if arg_key not in cache: + cache[arg_key] = user_function(*args) + return cache[arg_key] + return wrapper + return _lru_wrapper From c1cb1b26492ef4c393dc84f6e902ee0863b6cf83 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 28 Oct 2020 10:10:19 -0500 Subject: [PATCH 11/14] missing kwargs --- tests/python_package_test/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 57f97b8b3ce7..76347c829257 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -11,7 +11,7 @@ def _lru_wrapper(user_function): def wrapper(*args, **kwargs): arg_key = (args, tuple([item for item in kwargs.items()])) if arg_key not in cache: - cache[arg_key] = user_function(*args) + cache[arg_key] = user_function(*args, **kwargs) return cache[arg_key] return wrapper return _lru_wrapper From 16ec8aa888f375355db9ce6ab2a07e0070ddd483 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 28 Oct 2020 21:39:29 +0000 Subject: [PATCH 12/14] Apply suggestions from code review Co-authored-by: Nikita Titov --- tests/python_package_test/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 76347c829257..6c3e4601a1dd 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -1,3 +1,4 @@ +# coding: utf-8 try: from functools import lru_cache except ImportError: @@ -9,7 +10,7 @@ def lru_cache(maxsize=None): def _lru_wrapper(user_function): def wrapper(*args, **kwargs): - arg_key = (args, tuple([item for item in kwargs.items()])) + arg_key = (args, tuple(kwargs.items())) if arg_key not in cache: cache[arg_key] = user_function(*args, **kwargs) return cache[arg_key] From 4c2e7befa3ee4fe4f518699a8d1f81db0060d109 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 28 Oct 2020 17:38:08 -0500 Subject: [PATCH 13/14] reduce duplicated code --- tests/python_package_test/test_basic.py | 15 ++-- tests/python_package_test/test_engine.py | 100 +++++++++------------- tests/python_package_test/test_sklearn.py | 89 +++++++------------ tests/python_package_test/utils.py | 27 ++++++ 4 files changed, 103 insertions(+), 128 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 19a4debbb914..f53933021b14 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -7,21 +7,16 @@ import numpy as np from scipy import sparse -from sklearn.datasets import load_breast_cancer, dump_svmlight_file, load_svmlight_file +from sklearn.datasets import dump_svmlight_file, load_svmlight_file from sklearn.model_selection import train_test_split -from .utils import lru_cache - - -@lru_cache(maxsize=None) -def _load_breast_cancer(**kwargs): - return load_breast_cancer(**kwargs) +from .utils import load_breast_cancer, lru_cache class TestBasic(unittest.TestCase): def test(self): - X_train, X_test, y_train, y_test = train_test_split(*_load_breast_cancer(return_X_y=True), + X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) train_data = lgb.Dataset(X_train, label=y_train) valid_data = train_data.create_valid(X_test, label=y_test) @@ -93,7 +88,7 @@ def test(self): os.remove(tname) def test_chunked_dataset(self): - X_train, X_test, y_train, y_test = train_test_split(*_load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) + X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) chunk_size = X_train.shape[0] // 10 + 1 X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)] @@ -316,7 +311,7 @@ def check_asserts(data): self.assertAlmostEqual(data.label[1], data.weight[1]) self.assertListEqual(data.feature_name, data.get_feature_name()) - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) sequence = np.ones(y.shape[0]) sequence[0] = np.nan sequence[1] = np.inf diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 67696174a711..3cb4c7ff55c3 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -10,8 +10,7 @@ import lightgbm as lgb import numpy as np from scipy.sparse import csr_matrix, isspmatrix_csr, isspmatrix_csc -from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, - load_iris, load_svmlight_file, make_multilabel_classification) +from sklearn.datasets import load_svmlight_file, make_multilabel_classification from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error, roc_auc_score, average_precision_score from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold @@ -20,7 +19,7 @@ except ImportError: import pickle -from .utils import lru_cache +from .utils import load_boston, load_breast_cancer, load_digits, load_iris, lru_cache decreasing_generator = itertools.count(0, -1) @@ -53,29 +52,9 @@ def categorize(continuous_x): return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) -@lru_cache(maxsize=None) -def _load_boston(**kwargs): - return load_boston(**kwargs) - - -@lru_cache(maxsize=None) -def _load_breast_cancer(**kwargs): - return load_breast_cancer(**kwargs) - - -@lru_cache(maxsize=None) -def _load_digits(**kwargs): - return load_digits(**kwargs) - - -@lru_cache(maxsize=None) -def _load_iris(**kwargs): - return load_iris(**kwargs) - - class TestEngine(unittest.TestCase): def test_binary(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -97,7 +76,7 @@ def test_binary(self): self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) def test_rf(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'boosting_type': 'rf', @@ -122,7 +101,7 @@ def test_rf(self): self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) def test_regression(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'metric': 'l2', @@ -399,7 +378,7 @@ def test_categorical_non_zero_inputs(self): self.assertAlmostEqual(evals_result['valid_0']['auc'][-1], ret, places=5) def test_multiclass(self): - X, y = _load_digits(n_class=10, return_X_y=True) + X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'multiclass', @@ -420,7 +399,7 @@ def test_multiclass(self): self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5) def test_multiclass_rf(self): - X, y = _load_digits(n_class=10, return_X_y=True) + X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'boosting_type': 'rf', @@ -448,7 +427,7 @@ def test_multiclass_rf(self): self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5) def test_multiclass_prediction_early_stopping(self): - X, y = _load_digits(n_class=10, return_X_y=True) + X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'multiclass', @@ -474,7 +453,7 @@ def test_multiclass_prediction_early_stopping(self): self.assertLess(ret, 0.2) def test_multi_class_error(self): - X, y = _load_digits(n_class=10, return_X_y=True) + X, y = load_digits(n_class=10, return_X_y=True) params = {'objective': 'multiclass', 'num_classes': 10, 'metric': 'multi_error', 'num_leaves': 4, 'verbose': -1} lgb_data = lgb.Dataset(X, label=y) @@ -519,7 +498,7 @@ def test_multi_class_error(self): def test_auc_mu(self): # should give same result as binary auc for 2 classes - X, y = _load_digits(n_class=10, return_X_y=True) + X, y = load_digits(n_class=10, return_X_y=True) y_new = np.zeros((len(y))) y_new[y != 0] = 1 lgb_X = lgb.Dataset(X, label=y_new) @@ -597,7 +576,7 @@ def test_auc_mu(self): self.assertNotEqual(results_weight['training']['auc_mu'][-1], results_no_weight['training']['auc_mu'][-1]) def test_early_stopping(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) params = { 'objective': 'binary', 'metric': 'binary_logloss', @@ -629,7 +608,7 @@ def test_early_stopping(self): self.assertIn('binary_logloss', gbm.best_score[valid_set_name]) def test_continue_train(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'regression', @@ -657,7 +636,7 @@ def test_continue_train(self): os.remove(model_name) def test_continue_train_reused_dataset(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) params = { 'objective': 'regression', 'verbose': -1 @@ -670,7 +649,7 @@ def test_continue_train_reused_dataset(self): self.assertEqual(gbm.current_iteration(), 20) def test_continue_train_dart(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'boosting_type': 'dart', @@ -693,7 +672,7 @@ def test_continue_train_dart(self): self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5) def test_continue_train_multiclass(self): - X, y = _load_iris(return_X_y=True) + X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'multiclass', @@ -716,7 +695,7 @@ def test_continue_train_multiclass(self): self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5) def test_cv(self): - X_train, y_train = _load_boston(return_X_y=True) + X_train, y_train = load_boston(return_X_y=True) params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) # shuffle = False, override metric in params @@ -775,7 +754,7 @@ def test_cv(self): np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean']) def test_cvbooster(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -819,7 +798,7 @@ def test_cvbooster(self): self.assertLess(ret, 0.15) def test_feature_name(self): - X_train, y_train = _load_boston(return_X_y=True) + X_train, y_train = load_boston(return_X_y=True) params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) feature_names = ['f_' + str(i) for i in range(X_train.shape[-1])] @@ -847,7 +826,7 @@ def test_feature_name_with_non_ascii(self): def test_save_load_copy_pickle(self): def train_and_predict(init_model=None, return_model=False): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'regression', @@ -1011,7 +990,7 @@ def test_reference_chain(self): self.assertEqual(len(evals_result['valid_1']['rmse']), 20) def test_contribs(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -1394,7 +1373,7 @@ def test_small_max_bin(self): np.random.seed() # reset seed def test_refit(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -1410,7 +1389,7 @@ def test_refit(self): self.assertGreater(err_pred, new_err_pred) def test_mape_rf(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) params = { 'boosting_type': 'rf', 'objective': 'mape', @@ -1427,7 +1406,7 @@ def test_mape_rf(self): self.assertGreater(pred_mean, 20) def test_mape_dart(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) params = { 'boosting_type': 'dart', 'objective': 'mape', @@ -1506,7 +1485,7 @@ def preprocess_data(dtrain, dtest, params): params['num_class'] = 4 return dtrain, dtest, params - X, y = _load_iris(return_X_y=True) + X, y = load_iris(return_X_y=True) dataset = lgb.Dataset(X, y, free_raw_data=False) params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) @@ -1514,7 +1493,7 @@ def preprocess_data(dtrain, dtest, params): self.assertEqual(len(results['multi_logloss-mean']), 10) def test_metrics(self): - X, y = _load_digits(n_class=2, return_X_y=True) + X, y = load_digits(n_class=2, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train, silent=True) lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train, silent=True) @@ -1820,7 +1799,7 @@ def train_booster(params=params_obj_verbose, **kwargs): self.assertEqual(len(evals_result), 1) self.assertIn('error', evals_result['valid_0']) - X, y = _load_digits(n_class=3, return_X_y=True) + X, y = load_digits(n_class=3, return_X_y=True) lgb_train = lgb.Dataset(X, y, silent=True) obj_multi_aliases = ['multiclass', 'softmax', 'multiclassova', 'multiclass_ova', 'ova', 'ovr'] @@ -1888,7 +1867,7 @@ def train_booster(params=params_obj_verbose, **kwargs): params_class_3_verbose, metrics='binary_error', fobj=dummy_obj) def test_multiple_feval_train(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) params = {'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} @@ -1911,7 +1890,7 @@ def test_multiple_feval_train(self): self.assertIn('decreasing_metric', evals_result['valid_0']) def test_multiple_feval_cv(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) params = {'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} @@ -1934,7 +1913,7 @@ def test_multiple_feval_cv(self): @unittest.skipIf(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, 'not enough RAM') def test_model_size(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) data = lgb.Dataset(X, y) bst = lgb.train({'verbose': -1}, data, num_boost_round=2) y_pred = bst.predict(X) @@ -1960,7 +1939,7 @@ def test_model_size(self): self.skipTest('not enough RAM') def test_get_split_value_histogram(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) lgb_train = lgb.Dataset(X, y, categorical_feature=[2]) gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20) # test XGBoost-style return value @@ -2070,7 +2049,7 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, eval_train_metric=eval_train_metric) self.assertEqual(assumed_iteration, len(ret[list(ret.keys())[0]])) - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=73) lgb_train = lgb.Dataset(X_train, y_train) @@ -2148,7 +2127,7 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, decreasing_metric(preds, train_data)]) def test_node_level_subcol(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', @@ -2329,7 +2308,7 @@ def test_dataset_params_with_reference(self): def test_extra_trees(self): # check extra trees increases regularization - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) lgb_x = lgb.Dataset(X, label=y) params = {'objective': 'regression', 'num_leaves': 32, @@ -2347,7 +2326,7 @@ def test_extra_trees(self): def test_path_smoothing(self): # check path smoothing increases regularization - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) lgb_x = lgb.Dataset(X, label=y) params = {'objective': 'regression', 'num_leaves': 32, @@ -2369,7 +2348,7 @@ def _imptcs_to_numpy(X, impcts_dict): cols = ['Column_' + str(i) for i in range(X.shape[1])] return [impcts_dict.get(col, 0.) for col in cols] - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) data = lgb.Dataset(X, label=y) num_trees = 10 bst = lgb.train({"objective": "binary", "verbose": -1}, data, num_trees) @@ -2414,7 +2393,7 @@ def _imptcs_to_numpy(X, impcts_dict): self.assertIsNone(tree_df.loc[0, col]) def test_interaction_constraints(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) num_features = X.shape[1] train_data = lgb.Dataset(X, label=y) # check that constraint containing all features is equivalent to no constraint @@ -2491,7 +2470,7 @@ def inner_test(X, y, params, early_stopping_rounds): np.testing.assert_allclose(pred4, pred6) # test for regression - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) params = { 'objective': 'regression', 'verbose': -1, @@ -2504,7 +2483,7 @@ def inner_test(X, y, params, early_stopping_rounds): inner_test(X, y, params, early_stopping_rounds=None) # test for multi-class - X, y = _load_iris(return_X_y=True) + X, y = load_iris(return_X_y=True) params = { 'objective': 'multiclass', 'metric': 'multi_logloss', @@ -2518,7 +2497,7 @@ def inner_test(X, y, params, early_stopping_rounds): inner_test(X, y, params, early_stopping_rounds=None) # test for binary - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) params = { 'objective': 'binary', 'metric': 'binary_logloss', @@ -2546,6 +2525,7 @@ def test_average_precision_metric(self): sklearn_ap = average_precision_score(y, pred) self.assertAlmostEqual(ap, sklearn_ap) # test that average precision is 1 where model predicts perfectly + y = y.copy() y[:] = 1 lgb_X = lgb.Dataset(X, label=y) lgb.train(params, lgb_X, num_boost_round=1, valid_sets=[lgb_X], evals_result=res) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 678d135099fe..103518de0171 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -10,9 +10,7 @@ import numpy as np from sklearn import __version__ as sk_version from sklearn.base import clone -from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, - load_iris, load_linnerud, load_svmlight_file, - make_multilabel_classification) +from sklearn.datasets import load_svmlight_file, make_multilabel_classification from sklearn.exceptions import SkipTestWarning from sklearn.metrics import log_loss, mean_squared_error from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split @@ -22,7 +20,7 @@ check_parameters_default_constructible) from sklearn.utils.validation import check_is_fitted -from .utils import lru_cache +from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, lru_cache decreasing_generator = itertools.count(0, -1) @@ -76,35 +74,10 @@ def multi_logloss(y_true, y_pred): return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) -@lru_cache(maxsize=None) -def _load_boston(**kwargs): - return load_boston(**kwargs) - - -@lru_cache(maxsize=None) -def _load_breast_cancer(**kwargs): - return load_breast_cancer(**kwargs) - - -@lru_cache(maxsize=None) -def _load_digits(**kwargs): - return load_digits(**kwargs) - - -@lru_cache(maxsize=None) -def _load_iris(**kwargs): - return load_iris(**kwargs) - - -@lru_cache(maxsize=None) -def _load_linnerud(**kwargs): - return load_linnerud(**kwargs) - - class TestSklearn(unittest.TestCase): def test_binary(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -113,7 +86,7 @@ def test_binary(self): self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5) def test_regression(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=50, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -122,7 +95,7 @@ def test_regression(self): self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5) def test_multiclass(self): - X, y = _load_digits(n_class=10, return_X_y=True) + X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -165,7 +138,7 @@ def test_xendcg(self): self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6253) def test_regression_with_custom_objective(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -174,7 +147,7 @@ def test_regression_with_custom_objective(self): self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5) def test_binary_classification_with_custom_objective(self): - X, y = _load_digits(n_class=2, return_X_y=True) + X, y = load_digits(n_class=2, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) @@ -186,7 +159,7 @@ def test_binary_classification_with_custom_objective(self): self.assertLess(ret, 0.05) def test_dart(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(boosting_type='dart', n_estimators=50) gbm.fit(X_train, y_train) @@ -199,7 +172,7 @@ def test_dart(self): def test_stacking_classifier(self): from sklearn.ensemble import StackingClassifier - X, y = _load_iris(return_X_y=True) + X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) classifiers = [('gbm1', lgb.LGBMClassifier(n_estimators=3)), ('gbm2', lgb.LGBMClassifier(n_estimators=3))] @@ -226,7 +199,7 @@ def test_stacking_classifier(self): def test_stacking_regressor(self): from sklearn.ensemble import StackingRegressor - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) regressors = [('gbm1', lgb.LGBMRegressor(n_estimators=3)), ('gbm2', lgb.LGBMRegressor(n_estimators=3))] @@ -245,7 +218,7 @@ def test_stacking_regressor(self): self.assertEqual(len(reg.final_estimator_.feature_importances_), 15) def test_grid_search(self): - X, y = _load_iris(return_X_y=True) + X, y = load_iris(return_X_y=True) y = y.astype(str) # utilize label encoder at it's max power X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -275,7 +248,7 @@ def test_grid_search(self): self.assertLessEqual(score, 1.) def test_random_search(self): - X, y = _load_iris(return_X_y=True) + X, y = load_iris(return_X_y=True) y = y.astype(str) # utilize label encoder at it's max power X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -329,7 +302,7 @@ def test_multioutput_classifier(self): # sklearn < 0.23 does not have as_frame parameter @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') def test_multioutput_regressor(self): - bunch = _load_linnerud(as_frame=True) # returns a Bunch instance + bunch = load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -368,7 +341,7 @@ def test_classifier_chain(self): # sklearn < 0.23 does not have as_frame parameter @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') def test_regressor_chain(self): - bunch = _load_linnerud(as_frame=True) # returns a Bunch instance + bunch = load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) order = [2, 0, 1] @@ -385,7 +358,7 @@ def test_regressor_chain(self): self.assertIsInstance(regressor.booster_, lgb.Booster) def test_clone_and_property(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) gbm = lgb.LGBMRegressor(n_estimators=10, silent=True) gbm.fit(X, y, verbose=False) @@ -393,7 +366,7 @@ def test_clone_and_property(self): self.assertIsInstance(gbm.booster_, lgb.Booster) self.assertIsInstance(gbm.feature_importances_, np.ndarray) - X, y = _load_digits(n_class=2, return_X_y=True) + X, y = load_digits(n_class=2, return_X_y=True) clf = lgb.LGBMClassifier(n_estimators=10, silent=True) clf.fit(X, y, verbose=False) self.assertListEqual(sorted(clf.classes_), [0, 1]) @@ -402,7 +375,7 @@ def test_clone_and_property(self): self.assertIsInstance(clf.feature_importances_, np.ndarray) def test_joblib(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj, silent=True, importance_type='split') @@ -427,7 +400,7 @@ def test_joblib(self): np.testing.assert_allclose(pred_origin, pred_pickle) def test_random_state_object(self): - X, y = _load_iris(return_X_y=True) + X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) state1 = np.random.RandomState(123) state2 = np.random.RandomState(123) @@ -460,14 +433,14 @@ def test_random_state_object(self): df1, df3) def test_feature_importances_single_leaf(self): - data = _load_iris(return_X_y=False) + data = load_iris(return_X_y=False) clf = lgb.LGBMClassifier(n_estimators=10) clf.fit(data.data, data.target) importances = clf.feature_importances_ self.assertEqual(len(importances), 4) def test_feature_importances_type(self): - data = _load_iris(return_X_y=False) + data = load_iris(return_X_y=False) clf = lgb.LGBMClassifier(n_estimators=10) clf.fit(data.data, data.target) clf.set_params(importance_type='split') @@ -591,7 +564,7 @@ def test_pandas_sparse(self): def test_predict(self): # With default params - iris = _load_iris(return_X_y=False) + iris = load_iris(return_X_y=False) X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42) @@ -671,7 +644,7 @@ def test_predict(self): res_engine, res_sklearn_params) def test_evaluate_train_set(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) gbm = lgb.LGBMRegressor(n_estimators=10, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False) @@ -684,7 +657,7 @@ def test_evaluate_train_set(self): self.assertIn('l2', gbm.evals_result_['valid_1']) def test_metrics(self): - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) params = {'n_estimators': 2, 'verbose': -1} params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} @@ -724,7 +697,7 @@ def test_metrics(self): self.assertIn('mape', gbm.evals_result_['training']) # non-default metric with multiple metrics in eval_metric for LGBMClassifier - X_classification, y_classification = _load_breast_cancer(return_X_y=True) + X_classification, y_classification = load_breast_cancer(return_X_y=True) params_classification = {'n_estimators': 2, 'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} params_fit_classification = {'X': X_classification, 'y': y_classification, @@ -907,7 +880,7 @@ def test_metrics(self): self.assertIn('mape', gbm.evals_result_['training']) self.assertIn('error', gbm.evals_result_['training']) - X, y = _load_digits(n_class=3, return_X_y=True) + X, y = load_digits(n_class=3, return_X_y=True) params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} # default metric and invalid binary metric is replaced with multiclass alternative @@ -934,7 +907,7 @@ def test_metrics(self): self.assertIn('multi_logloss', gbm.evals_result_['training']) self.assertIn('multi_error', gbm.evals_result_['training']) - X, y = _load_digits(n_class=2, return_X_y=True) + X, y = load_digits(n_class=2, return_X_y=True) params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} # default metric and invalid multiclass metric is replaced with binary alternative @@ -951,7 +924,7 @@ def test_metrics(self): def test_multiple_eval_metrics(self): - X, y = _load_breast_cancer(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) params = {'n_estimators': 2, 'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'} params_fit = {'X': X, 'y': y, 'eval_set': (X, y), 'verbose': False} @@ -1030,7 +1003,7 @@ def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_ self.assertEqual(assumed_iteration if eval_set_name != 'training' else gbm.n_estimators, gbm.best_iteration_) - X, y = _load_boston(return_X_y=True) + X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72) params = {'n_estimators': 30, @@ -1111,7 +1084,7 @@ def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_ fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l2, True) def test_class_weight(self): - X, y = _load_digits(n_class=10, return_X_y=True) + X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) y_train_str = y_train.astype('str') y_test_str = y_test.astype('str') @@ -1145,7 +1118,7 @@ def test_class_weight(self): gbm_str.evals_result_[eval_set][metric]) def test_continue_training_with_model(self): - X, y = _load_digits(n_class=3, return_X_y=True) + X, y = load_digits(n_class=3, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) init_gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False) @@ -1160,7 +1133,7 @@ def test_continue_training_with_model(self): # sklearn < 0.22 requires passing "attributes" argument @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') def test_check_is_fitted(self): - X, y = _load_digits(n_class=2, return_X_y=True) + X, y = load_digits(n_class=2, return_X_y=True) est = lgb.LGBMModel(n_estimators=5, objective="binary") clf = lgb.LGBMClassifier(n_estimators=5) reg = lgb.LGBMRegressor(n_estimators=5) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 6c3e4601a1dd..f0b160d60dfb 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -1,4 +1,6 @@ # coding: utf-8 +import sklearn.datasets + try: from functools import lru_cache except ImportError: @@ -16,3 +18,28 @@ def wrapper(*args, **kwargs): return cache[arg_key] return wrapper return _lru_wrapper + + +@lru_cache(maxsize=None) +def load_boston(**kwargs): + return sklearn.datasets.load_boston(**kwargs) + + +@lru_cache(maxsize=None) +def load_breast_cancer(**kwargs): + return sklearn.datasets.load_breast_cancer(**kwargs) + + +@lru_cache(maxsize=None) +def load_digits(**kwargs): + return sklearn.datasets.load_digits(**kwargs) + + +@lru_cache(maxsize=None) +def load_iris(**kwargs): + return sklearn.datasets.load_iris(**kwargs) + + +@lru_cache(maxsize=None) +def load_linnerud(**kwargs): + return sklearn.datasets.load_linnerud(**kwargs) From dfb0fd35cdea555d8b8aac800fbb01528fbe0ca4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 29 Oct 2020 00:03:50 -0500 Subject: [PATCH 14/14] cache in test_plotting --- tests/python_package_test/test_basic.py | 2 +- tests/python_package_test/test_engine.py | 2 +- tests/python_package_test/test_plotting.py | 3 ++- tests/python_package_test/test_sklearn.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index f53933021b14..a0ce5b8f8b66 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -10,7 +10,7 @@ from sklearn.datasets import dump_svmlight_file, load_svmlight_file from sklearn.model_selection import train_test_split -from .utils import load_breast_cancer, lru_cache +from .utils import load_breast_cancer class TestBasic(unittest.TestCase): diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 3cb4c7ff55c3..de8689fd3ea5 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -19,7 +19,7 @@ except ImportError: import pickle -from .utils import load_boston, load_breast_cancer, load_digits, load_iris, lru_cache +from .utils import load_boston, load_breast_cancer, load_digits, load_iris decreasing_generator = itertools.count(0, -1) diff --git a/tests/python_package_test/test_plotting.py b/tests/python_package_test/test_plotting.py index 786b79760910..293012348ac3 100644 --- a/tests/python_package_test/test_plotting.py +++ b/tests/python_package_test/test_plotting.py @@ -3,7 +3,6 @@ import lightgbm as lgb from lightgbm.compat import MATPLOTLIB_INSTALLED, GRAPHVIZ_INSTALLED -from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split if MATPLOTLIB_INSTALLED: @@ -12,6 +11,8 @@ if GRAPHVIZ_INSTALLED: import graphviz +from .utils import load_breast_cancer + class TestBasic(unittest.TestCase): diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 103518de0171..623f83a517a5 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -20,7 +20,7 @@ check_parameters_default_constructible) from sklearn.utils.validation import check_is_fitted -from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, lru_cache +from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud decreasing_generator = itertools.count(0, -1)