embeddings-benchmark · KennethEnevoldsen · Oct 20, 2025 · Oct 20, 2025 · Oct 20, 2025
diff --git a/mteb/abstasks/clustering.py b/mteb/abstasks/clustering.py
@@ -1,6 +1,5 @@
 import itertools
 import logging
-import os
 import random
 from collections import defaultdict
 from pathlib import Path
@@ -32,7 +31,6 @@
 
 
 MultilingualDataset = dict[HFSubset, DatasetDict]
-OMP_NUM_THREADS = 4
 
 
 def _evaluate_clustering_bootstrapped(
@@ -55,17 +53,6 @@ def _evaluate_clustering_bootstrapped(
         - A dictionary where keys are level names (e.g., "Level 0", "Level 1", etc.) and values are lists of V-measure scores for each clustering experiment at that level.
         - A dictionary where keys are level names and values are lists of cluster assignments for each clustering experiment at that level.
     """
-    # set OMP_NUM_THREADS for reproductibility
-    if "OMP_NUM_THREADS" not in os.environ:
-        logger.info(
-            f"Setting OMP_NUM_THREADS to {OMP_NUM_THREADS} for clustering to ensure reproducibility."
-        )
-        os.environ["OMP_NUM_THREADS"] = str(OMP_NUM_THREADS)
-    else:
-        logger.info(
-            f"Using existing OMP_NUM_THREADS={os.environ['OMP_NUM_THREADS']} for clustering, this may lead to non-reproducible results. Set it to {OMP_NUM_THREADS} to ensure reproducibility."
-        )
-
     v_measures = defaultdict(list)
     cluster_assignments = defaultdict(list)
     if max_depth is not None:

diff --git a/tests/test_abstasks/test_predictions.py b/tests/test_abstasks/test_predictions.py
@@ -7,7 +7,6 @@
 from tests.mock_tasks import (
     MockBitextMiningTask,
     MockClassificationTask,
-    MockClusteringFastTask,
     MockClusteringTask,
     MockImageTextPairClassificationTask,
     MockPairClassificationTask,
@@ -97,10 +96,12 @@
             },
         ),
         (MockClusteringTask(), [[1, 2, 0]]),
-        (
-            MockClusteringFastTask(seed=1),
-            {"Level 0": [[0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 2, 1]]},
-        ),
+        # TODO: #3441
+        # Disabled due to being too flaky.
+        # (
+        #     MockClusteringFastTask(seed=1),
+        #     {"Level 0": [[0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 2, 1]]},
+        # ),
         (
             MockRegressionTask(),
             [pytest.approx([1.0000001192092896, 0.33665788173675537])] * 10,