
Commit 98179a9

Merge pull request #3 from fidelity/feature/cv (feature/cv)
2 parents: 24300df + e523001

7 files changed: +249 −85 lines

CHANGELOG.txt (+6)
```diff
@@ -2,6 +2,12 @@
 CHANGELOG
 =========
 
+-------------------------------------------------------------------------------
+March, 23, 2021 1.0.1
+-------------------------------------------------------------------------------
+
+- Add cross-validation (cv) capability to benchmark function.
+
 -------------------------------------------------------------------------------
 February, 1, 2021 1.0.0
 -------------------------------------------------------------------------------
```

README.md (+14 −2)
```diff
@@ -9,7 +9,7 @@ The library provides:
 
 * Automated task detection. No need to know what feature selection method works with what machine learning task
 
-* Benchmarking with multiple selectors
+* Benchmarking multiple selectors using cross-validation
 
 * Inspection of results and feature importance
 
@@ -91,7 +91,7 @@ selectors = {
 }
 
 # Benchmark
-score_df, selected_df, runtime_df = benchmark(selectors, data, label)
+score_df, selected_df, runtime_df = benchmark(selectors, data, label, cv=5)
 print(score_df, "\n\n", selected_df, "\n\n", runtime_df)
 
 # Get benchmark statistics by feature
@@ -125,6 +125,18 @@ plot_importance(df)
 
 Selective is available to install as `pip install selective`.
 
+## Source
+
+Alternatively, you can build a wheel package on your platform from scratch using the source code:
+
+```bash
+git clone https://github.com/fidelity/selective.git
+cd selective
+pip install setuptools wheel # if wheel is not installed
+python setup.py sdist bdist_wheel
+pip install dist/selective-X.X.X-py3-none-any.whl
+```
+
 ## Support
 
 Please submit bug reports and feature requests as [Issues](https://github.com/fidelity/selective/issues).
```
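For orientation, here is a minimal end-to-end sketch of the new cross-validated benchmark based on the README snippet above. The toy data and the selector settings (`Variance(threshold=0.0)`, `Correlation(threshold=0.5)`) are illustrative assumptions, not part of this diff:

```python
# Sketch only: the toy data and selector settings below are assumptions.
import numpy as np
import pandas as pd
from feature.selector import SelectionMethod, benchmark, calculate_statistics

# Toy regression data: 100 samples, 5 features
rng = np.random.RandomState(42)
data = pd.DataFrame(rng.rand(100, 5), columns=[f"x{i}" for i in range(5)])
label = pd.Series(2 * data["x0"] + rng.rand(100), name="label")

# Hypothetical selector configuration (see the README for real settings)
selectors = {
    "variance": SelectionMethod.Variance(threshold=0.0),
    "corr": SelectionMethod.Correlation(threshold=0.5),
}

# New in 1.0.1: cv=5 benchmarks each selector on 5 folds and concatenates
# the per-fold results, so each feature appears once per fold in the output
score_df, selected_df, runtime_df = benchmark(selectors, data, label, cv=5)

# calculate_statistics averages the folds per feature before ranking
print(calculate_statistics(score_df, selected_df))
```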

dist/selective-1.0.0-py3-none-any.whl (−33.2 KB)

Binary file not shown.

feature/_version.py (+1 −1)

```diff
@@ -2,4 +2,4 @@
 # Copyright FMR LLC <[email protected]>
 # SPDX-License-Identifier: GNU GPLv3
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
```

feature/selector.py (+104 −16)
```diff
@@ -22,6 +22,7 @@
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
 from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+from sklearn.model_selection import KFold
 from xgboost import XGBClassifier, XGBRegressor
 
 from feature.base import _BaseDispatcher, _BaseSupervisedSelector, _BaseUnsupervisedSelector
```
```diff
@@ -475,9 +476,11 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                          SelectionMethod.Variance]],
               data: pd.DataFrame,
               labels: Optional[pd.Series] = None,
+              cv: Optional[int] = None,
               output_filename: Optional[str] = None,
               drop_zero_variance_features: Optional[bool] = True,
-              verbose: bool = False) \
+              verbose: bool = False,
+              seed: int = Constants.default_seed) \
         -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     Benchmark with a given set of feature selectors.
```
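The extended signature stays backward compatible; a quick hedged sketch of the call patterns, assuming `selectors`, `data`, and `labels` are prepared as in the README:

```python
# cv=None (the default) preserves the old single-run behavior via _bench
score_df, selected_df, runtime_df = benchmark(selectors, data, labels)

# cv=5 runs the benchmark on 5 shuffled folds; `seed` controls the KFold
# shuffle and defaults to Constants.default_seed for reproducibility
score_df, selected_df, runtime_df = benchmark(selectors, data, labels, cv=5, seed=123)
```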
```diff
@@ -495,13 +498,90 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
         Data of shape (n_samples, n_features) used for feature selection.
     labels: pd.Series, optional (default=None)
         The target values (class labels in classification, real numbers in regression).
+    cv: int, optional (default=None)
+        Number of folds to use for cross-validation.
     output_filename: str, optional (default=None)
         If not None, benchmarking output is saved.
         If file exists, results are appended, otherwise file is created.
     drop_zero_variance_features: bool, optional (default=True)
         Whether to drop features with zero variance before running feature selector methods or not.
     verbose: bool, optional (default=False)
         Whether to print progress messages or not.
+    seed: int, optional (default=Constants.default_seed)
+        The random seed to initialize the random number generator.
+
+    Returns
+    -------
+    Tuple of data frames with scores, selected features and runtime for each method.
+    If cv is not None, the data frames will contain the concatenated results from each fold.
+    """
+
+    check_true(selectors is not None, ValueError("Benchmark selectors cannot be none."))
+    check_true(data is not None, ValueError("Benchmark data cannot be none."))
+
+    if cv is None:
+        return _bench(selectors=selectors,
+                      data=data,
+                      labels=labels,
+                      output_filename=output_filename,
+                      drop_zero_variance_features=drop_zero_variance_features,
+                      verbose=verbose)
+    else:
+
+        # Create K-Fold object
+        kf = KFold(n_splits=cv, shuffle=True, random_state=seed)
+
+        # Initialize variables
+        t0 = time()
+        train_labels, test_labels = None, None
+        score_df, selected_df, runtime_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+        # Split data into cv-folds and run _bench for each fold
+        if verbose:
+            print("\n>>> Running")
+        for fold, (train_index, _) in enumerate(kf.split(data)):
+
+            if verbose:
+                print("\tFold", fold, "...")
+
+            # Split data, labels into folds
+            train_data = data.iloc[train_index]
+            if labels is not None:
+                train_labels = labels.iloc[train_index]
+
+            # Run benchmark
+            score_cv_df, selected_cv_df, runtime_cv_df = _bench(selectors=selectors,
+                                                                data=train_data,
+                                                                labels=train_labels,
+                                                                output_filename=output_filename,
+                                                                drop_zero_variance_features=drop_zero_variance_features,
+                                                                verbose=False)
+
+            # Concatenate data frames
+            score_df = pd.concat((score_df, score_cv_df))
+            selected_df = pd.concat((selected_df, selected_cv_df))
+            runtime_df = pd.concat((runtime_df, runtime_cv_df))
+
+        if verbose:
+            print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")
+
+        return score_df, selected_df, runtime_df
+
+
+def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation,
+                                      SelectionMethod.Linear,
+                                      SelectionMethod.TreeBased,
+                                      SelectionMethod.Statistical,
+                                      SelectionMethod.Variance]],
+           data: pd.DataFrame,
+           labels: Optional[pd.Series] = None,
+           output_filename: Optional[str] = None,
+           drop_zero_variance_features: Optional[bool] = True,
+           verbose: bool = False) \
+        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Benchmark with a given set of feature selectors.
+    Return a tuple of data frames with scores, runtime and selected features for each method.
 
     Returns
     -------
```
```diff
@@ -552,7 +632,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
     if verbose:
         print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")
 
-    # Convert to series
+    # Format
     runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis("method").reset_index()
 
     return score_df, selected_df, runtime_df
```
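The renamed comment covers a small reshaping step; here is a standalone sketch of what that chained call produces (method names and timings made up):

```python
import pandas as pd

method_to_runtime = {"corr": 0.12, "variance": 0.03}  # hypothetical timings
runtime_df = (pd.Series(method_to_runtime)
              .to_frame("runtime")    # dict values become a "runtime" column
              .rename_axis("method")  # name the index axis
              .reset_index())         # lift the index into a "method" column
print(runtime_df)
#      method  runtime
# 0      corr     0.12
# 1  variance     0.03
```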
```diff
@@ -561,15 +641,19 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
 def calculate_statistics(scores: pd.DataFrame,
                          selected: pd.DataFrame,
                          columns: Optional[list] = None,
-                         ignore_constant: Optional[bool] = True):
-    """Calculate statistics for each feature using scores/selections from list of methods.
+                         ignore_constant: Optional[bool] = True) -> pd.DataFrame:
+    """
+    Calculate statistics for each feature using scores/selections from list of methods.
+    Returns data frame with calculated statistics for each feature.
 
     Parameters
     ----------
     scores: pd.DataFrame
         Data frame with scores for each feature (index) and selector (columns).
+        Each feature could have multiple rows from different cross-validation folds.
     selected: pd.DataFrame
         Data frame with selection flag for each feature (index) and selector (columns).
+        Each feature could have multiple rows from different cross-validation folds.
     columns: list (default=None)
         List of methods (columns) to include in statistics.
         If None, all methods (columns) will be used.
```
```diff
@@ -584,9 +668,9 @@ def calculate_statistics(scores: pd.DataFrame,
     check_true(isinstance(scores, pd.DataFrame), ValueError("scores must be a data frame."))
     check_true(isinstance(selected, pd.DataFrame), ValueError("selection must be a data frame."))
     check_true(scores.shape == selected.shape, ValueError("Shapes of scores and selected data frames must match."))
-    check_true(len(scores.index.intersection(selected.index)) == selected.shape[0],
+    check_true(np.all(scores.index == selected.index),
               ValueError("Index of score and selection data frames must match."))
-    check_true(len(scores.columns.intersection(selected.columns)) == selected.shape[1],
+    check_true(np.all(scores.columns == selected.columns),
               ValueError("Columns of score and selection data frames must match."))
 
     # Get columns to use
```
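The switch from intersection counting to element-wise equality matters once cross-validation introduces duplicate index entries: `Index.intersection` returns unique labels, so the old length check would reject valid concatenated frames. A sketch of the difference:

```python
import numpy as np
import pandas as pd

idx = pd.Index(["x0", "x1", "x0", "x1"])  # two folds' worth of rows

# Old check: intersection drops duplicates, so 2 != 4 and valid CV frames fail
print(len(idx.intersection(idx)) == len(idx))  # False

# New check: element-wise comparison handles repeated labels correctly
print(np.all(idx == idx))  # True
```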
```diff
@@ -597,25 +681,25 @@ def calculate_statistics(scores: pd.DataFrame,
     scores_df = scores[columns].copy()
     selected_df = selected[columns].copy()
 
+    # Group by feature for CV results
+    scores_df = scores_df.groupby(scores_df.index).mean()
+    selected_df = selected_df.groupby(selected_df.index).mean()
+
     # Drop methods with constant scores
     if ignore_constant:
         mask = ~np.isclose(np.var(scores_df, axis=0), 0)
         scores_df = scores_df.loc[:, mask]
         selected_df = selected_df.loc[:, mask]
 
-    # Sort by index
-    scores_df.sort_index(inplace=True)
-    selected_df.sort_index(inplace=True)
-
     # Calculate statistics
-    stats_df = pd.DataFrame(index=scores.index)
-    stats_df["_score_mean"] = scores_df.mean(axis=1)
-    stats_df["_score_mean_norm"] = normalize_columns(scores_df).mean(axis=1)
-    stats_df["_selection_freq"] = selected_df.sum(axis=1)
-    stats_df["_selection_freq_norm"] = normalize_columns(selected_df).sum(axis=1)
+    stats_df = pd.DataFrame(index=scores_df.index)
+    stats_df["score_mean"] = scores_df.mean(axis=1)
+    stats_df["score_mean_norm"] = normalize_columns(scores_df).mean(axis=1)
+    stats_df["selection_freq"] = selected_df.sum(axis=1)
+    stats_df["selection_freq_norm"] = normalize_columns(selected_df).sum(axis=1)
 
     # Sort
-    stats_df.sort_values(by="_score_mean_norm", ascending=False, inplace=True)
+    stats_df.sort_values(by="score_mean_norm", ascending=False, inplace=True)
 
     return stats_df
```
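With folds averaged per feature, a selection flag becomes a selection frequency; a toy example with a made-up second method:

```python
import pandas as pd

# cv=2: feature "x0" is selected by "corr" in both folds, by "lasso" in one
selected = pd.DataFrame({"corr": [1, 1], "lasso": [1, 0]}, index=["x0", "x0"])
per_feature = selected.groupby(selected.index).mean()
print(per_feature)              # corr = 1.0, lasso = 0.5
print(per_feature.sum(axis=1))  # selection_freq for x0: 1.5 (out of 2 methods)
```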

```diff
@@ -632,6 +716,7 @@ def plot_importance(scores: pd.DataFrame,
     ----------
     scores: pd.DataFrame
         Data frame with scores for each feature (index) and method (columns).
+        Each feature could have multiple rows from different cross-validation folds.
     columns: list (default=None)
         List of methods (columns) to include in statistics.
         If None, all methods (columns) will be used.
```
```diff
@@ -663,6 +748,9 @@ def plot_importance(scores: pd.DataFrame,
     df = scores[columns].copy()
     df.fillna(0, inplace=True)
 
+    # Group by feature for CV results
+    df = df.groupby(df.index).mean()
+
     # Get normalized scores such that scores for each method sums to 1
     if normalize:
         df = normalize_columns(df)
```

feature/tree_based.py (+2 −2)
```diff
@@ -50,14 +50,14 @@ def dispatch_model(self, labels: pd.Series, *args):
         # Custom estimator should be compatible with the task
         if "classification_" in task_str:
             if isinstance(self.estimator, CatBoost):
-                if self.estimator._estimator_type is not 'classifier':
+                if self.estimator._estimator_type != 'classifier':
                     raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
             else:
                 if not isinstance(self.estimator, ClassifierMixin):
                     raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
         else:
             if isinstance(self.estimator, CatBoost):
-                if self.estimator._estimator_type is not 'regressor':
+                if self.estimator._estimator_type != 'regressor':
                     raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
             else:
                 if not isinstance(self.estimator, RegressorMixin):
```
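The `is not` → `!=` change fixes an identity-versus-equality bug: `is` compares object identity, which is unreliable for strings (and raises a SyntaxWarning for literals since CPython 3.8). A minimal demonstration:

```python
a = "classifier"
b = "".join(["class", "ifier"])  # equal value, constructed at runtime

print(a == b)   # True: value equality, which the type check intends
print(a is b)   # False: distinct objects, so `is not` could wrongly trigger
```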
