Skip to content

Commit

Permalink
novelty score for GASearchCV
Browse files Browse the repository at this point in the history
  • Loading branch information
rodrigo-arenas committed Sep 12, 2024
1 parent a5006fa commit d8ae6a9
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 7 deletions.
9 changes: 9 additions & 0 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ Features:
allowing the algorithm to focus on refining known good solutions while still exploring new
areas of the hyperparameter space. If set to ``None``, the entire population will be initialized
randomly.
* Introduced a **novelty search strategy** to the ``GASearchCV`` class. This strategy rewards solutions that are more distinct from others
in the population by incorporating a **novelty score** into the fitness evaluation. The novelty score encourages exploration and promotes diversity,
reducing the risk of premature convergence to local optima.

- **Novelty Score**: Calculated based on the distance between an individual and its nearest neighbors in the population.
Individuals with higher novelty scores are more distinct from the rest of the population.
- **Fitness Evaluation**: The overall fitness is now a combination of the traditional performance score and the novelty score,
allowing the algorithm to balance between exploiting known good solutions and exploring new, diverse ones.
- **Improved Exploration**: This strategy helps explore new areas of the hyperparameter space, increasing the likelihood of discovering better solutions and avoiding local optima.

^^^^^^^^^^^^
API Changes:
Expand Down
16 changes: 9 additions & 7 deletions sklearn_genetic/genetic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
create_feature_selection_cv_results_,
)
from .utils.random import weighted_bool_individual
from .utils.tools import cxUniform, mutFlipBit
from .utils.tools import cxUniform, mutFlipBit, novelty_scorer


class GASearchCV(BaseSearchCV):
Expand Down Expand Up @@ -308,7 +308,7 @@ def _register(self):
"""
self.toolbox = base.Toolbox()

self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign])
self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, 1.0])
self.creator.create("Individual", list, fitness=creator.FitnessMax)

attributes = []
Expand Down Expand Up @@ -352,10 +352,10 @@ def _register(self):
self._hof = tools.HallOfFame(self.keep_top_k)

self._stats = tools.Statistics(lambda ind: ind.fitness.values)
self._stats.register("fitness", np.mean)
self._stats.register("fitness_std", np.std)
self._stats.register("fitness_max", np.max)
self._stats.register("fitness_min", np.min)
self._stats.register("fitness", np.mean, axis=0)
self._stats.register("fitness_std", np.std, axis=0)
self._stats.register("fitness_max", np.max, axis=0)
self._stats.register("fitness_min", np.min, axis=0)

self.logbook = tools.Logbook()

Expand Down Expand Up @@ -454,6 +454,8 @@ def evaluate(self, individual):
cv_scores = cv_results[f"test_{self.refit_metric}"]
score = np.mean(cv_scores)

novelty_score = novelty_scorer(individual, self._pop)

# Uses the log config to save in remote log server (e.g MLflow)
if self.log_config is not None:
self.log_config.create_run(
Expand All @@ -480,7 +482,7 @@ def evaluate(self, individual):
# Log the hyperparameters and the cv-score
self.logbook.record(parameters=current_generation_params)

fitness_result = [score]
fitness_result = [score, novelty_score]

if self.use_cache:
# Store the fitness result and the current generation parameters in the cache
Expand Down
40 changes: 40 additions & 0 deletions sklearn_genetic/utils/tools.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import random
import numpy as np


def mutFlipBit(individual, indpb):
Expand Down Expand Up @@ -67,3 +68,42 @@ def check_bool_individual(individual):
individual[index] = 1

return individual


def novelty_scorer(individual, population, k=15):
    """
    Calculate the novelty score of an individual with respect to a population.

    The score is the mean Hamming distance (number of positions whose values
    differ) between ``individual`` and its ``k`` closest members of
    ``population``.

    Parameters
    ----------
    individual: Individual object
        The individual (set of hyperparameters) that is being evaluated.
    population: List[Individual] or None
        The current population of individuals. ``None`` is treated like an
        empty population.
    k: int, default=15
        The number of nearest neighbors to consider for the novelty calculation.
        Values larger than the number of available neighbors use all of them;
        values lower than 1 yield a score of ``0.0``.

    Returns
    -------
    novelty_score: float
        The novelty score for the individual, ``0.0`` when there is no
        neighbor to compare against.

    Notes
    -----
    Members of ``population`` that compare equal to ``individual`` are skipped,
    so exact duplicates do not pull the score down. Positions are paired with
    ``zip``, hence when lengths differ only the common prefix is compared.
    """
    if population is None:
        return 0.0

    # Hamming distance to every population member that differs from the individual
    distances = sorted(
        sum(gene != other_gene for gene, other_gene in zip(individual, other))
        for other in population
        if other != individual
    )

    # Clamp k at zero: a negative stop in the slice would silently drop the
    # farthest neighbors instead of selecting the nearest ones.
    nearest_distances = distances[: max(k, 0)]
    if not nearest_distances:
        return 0.0

    # Always return a built-in float so both code paths have the same type
    return float(np.mean(nearest_distances))

0 comments on commit d8ae6a9

Please sign in to comment.