diff --git a/mteb/abstasks/pair_classification.py b/mteb/abstasks/pair_classification.py
index 180a560c3e..beec805197 100644
--- a/mteb/abstasks/pair_classification.py
+++ b/mteb/abstasks/pair_classification.py
@@ -25,6 +25,8 @@
 if TYPE_CHECKING:
     from pathlib import Path
 
+    from numpy.typing import NDArray
+
     from mteb._evaluators.pair_classification_evaluator import (
         PairClassificationDistances,
     )
@@ -36,7 +38,6 @@
         TextStatistics,
     )
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -138,7 +139,7 @@ def _compute_metrics(
         self, similarity_scores: PairClassificationDistances, labels: list[int]
     ) -> dict[str, float]:
         logger.info("Computing metrics...")
-        np_labels = np.asarray(labels)
+        np_labels: NDArray[np.int64] = np.asarray(labels, dtype=np.int64)
         output_scores = {}
         max_scores = defaultdict(list)
         for short_name, scores, reverse in [
@@ -281,7 +282,10 @@ def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         )
 
     def _compute_metrics_values(
-        self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
+        self,
+        scores: list[float],
+        labels: NDArray[np.int64],
+        high_score_more_similar: bool,
     ) -> dict[str, float]:
         """Compute the metrics for the given scores and labels.
 
@@ -315,7 +319,10 @@ def _compute_metrics_values(
         )
 
     def _find_best_acc_and_threshold(
-        self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
+        self,
+        scores: list[float],
+        labels: NDArray[np.int64],
+        high_score_more_similar: bool,
     ) -> tuple[float, float]:
         rows = list(zip(scores, labels))
         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
@@ -323,7 +330,7 @@ def _find_best_acc_and_threshold(
         max_acc = 0
         best_threshold = -1.0
         positive_so_far = 0
-        remaining_negatives = sum(np.array(labels) == 0)
+        remaining_negatives = sum(labels == 0)
 
         for i in range(len(rows) - 1):
             score, label = rows[i]
@@ -339,10 +346,9 @@ def _find_best_acc_and_threshold(
         return max_acc, best_threshold
 
     def _find_best_f1_and_threshold(
-        self, scores, labels, high_score_more_similar: bool
+        self, scores, labels: NDArray[np.int64], high_score_more_similar: bool
     ) -> tuple[float, float, float, float]:
         scores = np.asarray(scores)
-        labels = np.asarray(labels)
 
         rows = list(zip(scores, labels))
 
diff --git a/mteb/tasks/pair_classification/fas/fars_tail.py b/mteb/tasks/pair_classification/fas/fars_tail.py
index 96b9e8dad4..610802b193 100644
--- a/mteb/tasks/pair_classification/fas/fars_tail.py
+++ b/mteb/tasks/pair_classification/fas/fars_tail.py
@@ -1,5 +1,3 @@
-import datasets
-
 from mteb.abstasks.pair_classification import AbsTaskPairClassification
 from mteb.abstasks.task_metadata import TaskMetadata
 
@@ -8,8 +6,8 @@ class FarsTail(AbsTaskPairClassification):
     metadata = TaskMetadata(
         name="FarsTail",
         dataset={
-            "path": "azarijafari/FarsTail",
-            "revision": "7335288588f14e5a687d97fc979194c2abe6f4e7",
+            "path": "mteb/FarsTail",
+            "revision": "0fa0863dc160869b5a2d78803b4440ea3c671ff5",
         },
         description="This dataset, named FarsTail, includes 10,367 samples which are provided in both the Persian language as well as the indexed format to be useful for non-Persian researchers. The samples are generated from 3,539 multiple-choice questions with the least amount of annotator interventions in a way similar to the SciTail dataset",
         reference="https://link.springer.com/article/10.1007/s00500-023-08959-3",
@@ -37,33 +35,3 @@ class FarsTail(AbsTaskPairClassification):
 }
 """,  # after removing neutral
     )
-
-    def load_data(self, num_proc: int = 1, **kwargs) -> None:
-        if self.data_loaded:
-            return
-        path = self.metadata.dataset["path"]
-        revision = self.metadata.dataset["revision"]
-        data_files = {
-            "test": f"https://huggingface.co/datasets/{path}/resolve/{revision}/data/Test-word.csv"
-        }
-        self.dataset = datasets.load_dataset(
-            "csv", data_files=data_files, delimiter="\t"
-        )
-        self.dataset_transform()
-        self.data_loaded = True
-
-    def dataset_transform(self, num_proc: int = 1):
-        _dataset = {}
-        self.dataset = self.dataset.filter(lambda x: x["label"] != "n")
-        self.dataset = self.dataset.map(
-            lambda example: {"label": 1 if example["label"] == "e" else 0}
-        )
-        for split in self.metadata.eval_splits:
-            _dataset[split] = [
-                {
-                    "sentence1": self.dataset[split]["premise"],
-                    "sentence2": self.dataset[split]["hypothesis"],
-                    "labels": self.dataset[split]["label"],
-                }
-            ]
-        self.dataset = _dataset
diff --git a/pyproject.toml b/pyproject.toml
index e1bf0079bb..160b38ed8d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -428,6 +428,7 @@ extend-exclude = [
     "docs/references.bib",
     "mteb/models/model_implementations/gme_v_models.py", # video_grid_thw `thw`
     "mteb/models/model_implementations/vista_models.py", # self.normlized: in visual bge
+    "mteb/models/model_implementations/salesforce_models.py", # multiligual in paper title
    "tests/mock_tasks.py", # "denne her matche ikke den ovenstående",
     "mteb/models/model_implementations/kalm_models.py", # prompt: classify ist topic",
     "mteb/tasks/reranking/eng/built_bench_reranking.py", # prompt: descriptions from buit asset
diff --git a/tests/test_search_index/test_search_index.py b/tests/test_search_index/test_search_index.py
index 12d03f2b3c..d85f1c659f 100644
--- a/tests/test_search_index/test_search_index.py
+++ b/tests/test_search_index/test_search_index.py
@@ -30,6 +30,7 @@ def test_retrieval_backends(
     task: AbsTaskRetrieval, similarity: ScoringFunction, tmp_path: Path
 ):
     """Test different retrieval backends for retrieval and reranking tasks."""
+    pytest.importorskip("faiss", reason="faiss is not installed")
     model = mteb.get_model("baseline/random-encoder-baseline")
     model_meta = deepcopy(model.mteb_model_meta)
     model_meta.similarity_fn_name = similarity