novelty score for GASearchCV

rodrigo-arenas · Sep 12, 2024 · d8ae6a9 · d8ae6a9
1 parent a5006fa
commit d8ae6a9
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 7 deletions.
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
@@ -34,6 +34,15 @@ Features:
   allowing the algorithm to focus on refining known good solutions while still exploring new
   areas of the hyperparameter space. If set to ``None``, the entire population will be initialized
   randomly.
+* Introduced a **novelty search strategy** to the ``GASearchCV`` class. This strategy rewards solutions that are more distinct from others
+  in the population by incorporating a **novelty score** into the fitness evaluation. The novelty score encourages exploration and promotes diversity,
+  reducing the risk of premature convergence to local optima.
+
+       - **Novelty Score**: Calculated based on the distance between an individual and its nearest neighbors in the population.
+         Individuals with higher novelty scores are more distinct from the rest of the population.
+       - **Fitness Evaluation**: The overall fitness is now a combination of the traditional performance score and the novelty score,
+         allowing the algorithm to balance between exploiting known good solutions and exploring new, diverse ones.
+       - **Improved Exploration**: This strategy helps explore new areas of the hyperparameter space, increasing the likelihood of discovering better solutions and avoiding local optima.
 
 ^^^^^^^^^^^^
 API Changes:

diff --git a/sklearn_genetic/genetic_search.py b/sklearn_genetic/genetic_search.py
@@ -28,7 +28,7 @@
     create_feature_selection_cv_results_,
 )
 from .utils.random import weighted_bool_individual
-from .utils.tools import cxUniform, mutFlipBit
+from .utils.tools import cxUniform, mutFlipBit, novelty_scorer
 
 
 class GASearchCV(BaseSearchCV):
@@ -308,7 +308,7 @@ def _register(self):
         """
         self.toolbox = base.Toolbox()
 
-        self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign])
+        self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, 1.0])
         self.creator.create("Individual", list, fitness=creator.FitnessMax)
 
         attributes = []
@@ -352,10 +352,10 @@ def _register(self):
         self._hof = tools.HallOfFame(self.keep_top_k)
 
         self._stats = tools.Statistics(lambda ind: ind.fitness.values)
-        self._stats.register("fitness", np.mean)
-        self._stats.register("fitness_std", np.std)
-        self._stats.register("fitness_max", np.max)
-        self._stats.register("fitness_min", np.min)
+        self._stats.register("fitness", np.mean, axis=0)
+        self._stats.register("fitness_std", np.std, axis=0)
+        self._stats.register("fitness_max", np.max, axis=0)
+        self._stats.register("fitness_min", np.min, axis=0)
 
         self.logbook = tools.Logbook()
 
@@ -454,6 +454,8 @@ def evaluate(self, individual):
         cv_scores = cv_results[f"test_{self.refit_metric}"]
         score = np.mean(cv_scores)
 
+        novelty_score = novelty_scorer(individual, self._pop)
+
         # Uses the log config to save in remote log server (e.g MLflow)
         if self.log_config is not None:
             self.log_config.create_run(
@@ -480,7 +482,7 @@ def evaluate(self, individual):
         # Log the hyperparameters and the cv-score
         self.logbook.record(parameters=current_generation_params)
 
-        fitness_result = [score]
+        fitness_result = [score, novelty_score]
 
         if self.use_cache:
             # Store the fitness result and the current generation parameters in the cache

diff --git a/sklearn_genetic/utils/tools.py b/sklearn_genetic/utils/tools.py
@@ -1,4 +1,5 @@
 import random
+import numpy as np
 
 
 def mutFlipBit(individual, indpb):
@@ -67,3 +68,42 @@ def check_bool_individual(individual):
         individual[index] = 1
 
     return individual
+
+
def novelty_scorer(individual, population, k=15):
    """
    Calculate the novelty score of an individual as its mean Hamming distance
    to its nearest neighbors in the population.

    Parameters
    ----------
    individual: Individual object
        The individual (set of hyperparameters) that is being evaluated.

    population: List[Individual]
        The current population of individuals. Members that compare equal to
        ``individual`` (same values, not only the same object) are skipped, so
        exact duplicates never contribute a zero distance.

    k: int, default=15
        The maximum number of nearest neighbors to average over. Fewer are used
        when the population has fewer members that differ from ``individual``.
        A value of zero or less selects no neighbors.

    Returns
    -------
    novelty_score: float
        The mean Hamming distance to the ``k`` nearest neighbors, or ``0.0``
        when there is no neighbor to compare against.
    """
    # Guard explicitly: a negative k used as a slice bound would silently drop
    # the *farthest* neighbors instead of selecting the nearest ones.
    if k <= 0:
        return 0.0

    # Hamming distance: number of positions whose values differ. ``zip`` stops
    # at the shorter sequence, so trailing positions of a longer one are ignored.
    distances = sorted(
        sum(gene != other_gene for gene, other_gene in zip(individual, other))
        for other in population
        if other != individual
    )

    # Slicing already caps at the number of available distances.
    nearest_distances = distances[:k]
    if not nearest_distances:
        return 0.0

    # Always return a plain float so both code paths yield the same type.
    return float(np.mean(nearest_distances))