22
22
from sklearn .ensemble import RandomForestClassifier , RandomForestRegressor
23
23
from sklearn .ensemble import ExtraTreesClassifier , ExtraTreesRegressor
24
24
from sklearn .ensemble import GradientBoostingClassifier , GradientBoostingRegressor
25
+ from sklearn .model_selection import KFold
25
26
from xgboost import XGBClassifier , XGBRegressor
26
27
27
28
from feature .base import _BaseDispatcher , _BaseSupervisedSelector , _BaseUnsupervisedSelector
def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                         SelectionMethod.Linear,
                                         SelectionMethod.TreeBased,
                                         SelectionMethod.Statistical,
                                         SelectionMethod.Variance]],
              data: pd.DataFrame,
              labels: Optional[pd.Series] = None,
              cv: Optional[int] = None,
              output_filename: Optional[str] = None,
              drop_zero_variance_features: Optional[bool] = True,
              verbose: bool = False,
              seed: int = Constants.default_seed) \
        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Benchmark with a given set of feature selectors.
    Return a tuple of data frames with scores, selected features and runtime for each method.

    Parameters
    ----------
    selectors: Dict[str, SelectionMethod]
        Dictionary mapping a method name to a selection method to benchmark.
    data: pd.DataFrame
        Data of shape (n_samples, n_features) used for feature selection.
    labels: pd.Series, optional (default=None)
        The target values (class labels in classification, real numbers in regression).
    cv: int, optional (default=None)
        Number of folds to use for cross-validation.
        If None, the benchmark runs once on the full data.
    output_filename: str, optional (default=None)
        If not None, benchmarking output is saved.
        If file exists, results are appended, otherwise file is created.
    drop_zero_variance_features: bool, optional (default=True)
        Whether to drop features with zero variance before running feature selector methods or not.
    verbose: bool, optional (default=False)
        Whether to print progress messages or not.
    seed: int, optional (default=Constants.default_seed)
        The random seed to initialize the random number generator.

    Returns
    -------
    Tuple of data frames with scores, selected features and runtime for each method.
    If cv is not None, the data frames will contain the concatenated results from each fold.
    """

    check_true(selectors is not None, ValueError("Benchmark selectors cannot be none."))
    check_true(data is not None, ValueError("Benchmark data cannot be none."))

    # Single run on the whole data when cross-validation is not requested
    if cv is None:
        return _bench(selectors=selectors,
                      data=data,
                      labels=labels,
                      output_filename=output_filename,
                      drop_zero_variance_features=drop_zero_variance_features,
                      verbose=verbose)

    # Fail fast with a clear message instead of deferring to KFold's own validation
    check_true(cv >= 2, ValueError("cv must be at least 2 folds."))

    # Create K-Fold object
    kf = KFold(n_splits=cv, shuffle=True, random_state=seed)

    # Initialize variables
    t0 = time()
    train_labels = None
    score_df, selected_df, runtime_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Split data into cv-folds and run _bench for each fold
    if verbose:
        print("\n>>> Running")
    for fold, (train_index, _) in enumerate(kf.split(data)):

        if verbose:
            print("\tFold", fold, "...")

        # Slice the training split for this fold; the held-out fold is unused
        # because benchmarking only scores/selects on the training portion
        train_data = data.iloc[train_index]
        if labels is not None:
            train_labels = labels.iloc[train_index]

        # Run benchmark on this fold
        score_cv_df, selected_cv_df, runtime_cv_df = _bench(selectors=selectors,
                                                            data=train_data,
                                                            labels=train_labels,
                                                            output_filename=output_filename,
                                                            drop_zero_variance_features=drop_zero_variance_features,
                                                            verbose=False)

        # Concatenate fold results; downstream consumers group rows by feature index
        score_df = pd.concat((score_df, score_cv_df))
        selected_df = pd.concat((selected_df, selected_cv_df))
        runtime_df = pd.concat((runtime_df, runtime_cv_df))

    if verbose:
        print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

    return score_df, selected_df, runtime_df
571
+ def _bench (selectors : Dict [str , Union [SelectionMethod .Correlation ,
572
+ SelectionMethod .Linear ,
573
+ SelectionMethod .TreeBased ,
574
+ SelectionMethod .Statistical ,
575
+ SelectionMethod .Variance ]],
576
+ data : pd .DataFrame ,
577
+ labels : Optional [pd .Series ] = None ,
578
+ output_filename : Optional [str ] = None ,
579
+ drop_zero_variance_features : Optional [bool ] = True ,
580
+ verbose : bool = False ) \
581
+ -> Tuple [pd .DataFrame , pd .DataFrame , pd .DataFrame ]:
582
+ """
583
+ Benchmark with a given set of feature selectors.
584
+ Return a tuple of data frames with scores, runtime and selected features for each method.
505
585
506
586
Returns
507
587
-------
@@ -552,7 +632,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
552
632
if verbose :
553
633
print (f"<<< Done! Time taken: { (time () - t0 ) / 60 :.2f} minutes" )
554
634
555
- # Convert to series
635
+ # Format
556
636
runtime_df = pd .Series (method_to_runtime ).to_frame ("runtime" ).rename_axis ("method" ).reset_index ()
557
637
558
638
return score_df , selected_df , runtime_df
@@ -561,15 +641,19 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
561
641
def calculate_statistics (scores : pd .DataFrame ,
562
642
selected : pd .DataFrame ,
563
643
columns : Optional [list ] = None ,
564
- ignore_constant : Optional [bool ] = True ):
565
- """Calculate statistics for each feature using scores/selections from list of methods.
644
+ ignore_constant : Optional [bool ] = True ) -> pd .DataFrame :
645
+ """
646
+ Calculate statistics for each feature using scores/selections from list of methods.
647
+ Returns data frame with calculated statistics for each feature.
566
648
567
649
Parameters
568
650
----------
569
651
scores: pd.DataFrame
570
652
Data frame with scores for each feature (index) and selector (columns).
653
+ Each feature could have multiple rows from different cross-validation folds.
571
654
selected: pd.DataFrame
572
655
Data frame with selection flag for each feature (index) and selector (columns).
656
+ Each feature could have multiple rows from different cross-validation folds.
573
657
columns: list (default=None)
574
658
List of methods (columns) to include in statistics.
575
659
If None, all methods (columns) will be used.
@@ -584,9 +668,9 @@ def calculate_statistics(scores: pd.DataFrame,
584
668
check_true (isinstance (scores , pd .DataFrame ), ValueError ("scores must be a data frame." ))
585
669
check_true (isinstance (selected , pd .DataFrame ), ValueError ("selection must be a data frame." ))
586
670
check_true (scores .shape == selected .shape , ValueError ("Shapes of scores and selected data frames must match." ))
587
- check_true (len (scores .index . intersection ( selected . index )) == selected .shape [ 0 ] ,
671
+ check_true (np . all (scores .index == selected .index ) ,
588
672
ValueError ("Index of score and selection data frames must match." ))
589
- check_true (len (scores .columns . intersection ( selected . columns )) == selected .shape [ 1 ] ,
673
+ check_true (np . all (scores .columns == selected .columns ) ,
590
674
ValueError ("Columns of score and selection data frames must match." ))
591
675
592
676
# Get columns to use
@@ -597,25 +681,25 @@ def calculate_statistics(scores: pd.DataFrame,
597
681
scores_df = scores [columns ].copy ()
598
682
selected_df = selected [columns ].copy ()
599
683
684
+ # Group by feature for CV results
685
+ scores_df = scores_df .groupby (scores_df .index ).mean ()
686
+ selected_df = selected_df .groupby (selected_df .index ).mean ()
687
+
600
688
# Drop methods with constant scores
601
689
if ignore_constant :
602
690
mask = ~ np .isclose (np .var (scores_df , axis = 0 ), 0 )
603
691
scores_df = scores_df .loc [:, mask ]
604
692
selected_df = selected_df .loc [:, mask ]
605
693
606
- # Sort by index
607
- scores_df .sort_index (inplace = True )
608
- selected_df .sort_index (inplace = True )
609
-
610
694
# Calculate statistics
611
- stats_df = pd .DataFrame (index = scores .index )
612
- stats_df ["_score_mean " ] = scores_df .mean (axis = 1 )
613
- stats_df ["_score_mean_norm " ] = normalize_columns (scores_df ).mean (axis = 1 )
614
- stats_df ["_selection_freq " ] = selected_df .sum (axis = 1 )
615
- stats_df ["_selection_freq_norm " ] = normalize_columns (selected_df ).sum (axis = 1 )
695
+ stats_df = pd .DataFrame (index = scores_df .index )
696
+ stats_df ["score_mean " ] = scores_df .mean (axis = 1 )
697
+ stats_df ["score_mean_norm " ] = normalize_columns (scores_df ).mean (axis = 1 )
698
+ stats_df ["selection_freq " ] = selected_df .sum (axis = 1 )
699
+ stats_df ["selection_freq_norm " ] = normalize_columns (selected_df ).sum (axis = 1 )
616
700
617
701
# Sort
618
- stats_df .sort_values (by = "_score_mean_norm " , ascending = False , inplace = True )
702
+ stats_df .sort_values (by = "score_mean_norm " , ascending = False , inplace = True )
619
703
620
704
return stats_df
621
705
@@ -632,6 +716,7 @@ def plot_importance(scores: pd.DataFrame,
632
716
----------
633
717
scores: pd.DataFrame
634
718
Data frame with scores for each feature (index) and method (columns).
719
+ Each feature could have multiple rows from different cross-validation folds.
635
720
columns: list (default=None)
636
721
List of methods (columns) to include in statistics.
637
722
If None, all methods (columns) will be used.
@@ -663,6 +748,9 @@ def plot_importance(scores: pd.DataFrame,
663
748
df = scores [columns ].copy ()
664
749
df .fillna (0 , inplace = True )
665
750
751
+ # Group by feature for CV results
752
+ df = df .groupby (df .index ).mean ()
753
+
666
754
# Get normalized scores such that scores for each method sums to 1
667
755
if normalize :
668
756
df = normalize_columns (df )
0 commit comments