diff --git a/mteb/abstasks/AbsTaskInstructionRetrieval.py b/mteb/abstasks/AbsTaskInstructionRetrieval.py
index b30eb92945..a0107abc75 100644
--- a/mteb/abstasks/AbsTaskInstructionRetrieval.py
+++ b/mteb/abstasks/AbsTaskInstructionRetrieval.py
@@ -372,6 +372,7 @@ def _evaluate_subset_lang(
         )
         top_ranked = top_ranked[split]
 
+        kwargs["prediction_name"] = "og"  # for naming predictions, as needed
         scores_og, results_og = self._evaluate_subset(
             retriever,
             corpus,
@@ -382,6 +383,7 @@ def _evaluate_subset_lang(
             lang,
             **kwargs,
         )
+        kwargs["prediction_name"] = "changed"  # for naming predictions, as needed
         scores_changed, results_changed = self._evaluate_subset(
             retriever,
             corpus,
@@ -411,6 +413,7 @@ def _evaluate_subset_lang(
             keywords[split],
             short_instructions[split],
         )
+        kwargs["prediction_name"] = "base"  # for naming predictions, as needed
         scores_base, results_base = self._evaluate_subset(
             retriever,
             corpus,
@@ -421,6 +424,7 @@ def _evaluate_subset_lang(
             lang,
             **kwargs,
         )
+        kwargs["prediction_name"] = "keywords"  # for naming predictions, as needed
         scores_w_keywords_scores, scores_w_keywords_results = self._evaluate_subset(
             retriever,
             corpus,
@@ -431,6 +435,9 @@ def _evaluate_subset_lang(
             lang,
             **kwargs,
         )
+        kwargs["prediction_name"] = (
+            "short_instr"  # for naming predictions, as needed
+        )
         (
             scores_w_short_instr_scores,
             scores_w_short_instr_result,
         ) = self._evaluate_subset(
@@ -572,6 +579,11 @@ def _evaluate_subset(
         else:
             qrels_save_path = f"{output_folder}/{self.metadata_dict['name']}_{lang}_predictions.json"
 
+        if kwargs.get("prediction_name", None):
+            qrels_save_path = qrels_save_path.replace(
+                ".json", f"_{kwargs['prediction_name']}.json"
+            )
+
         with open(qrels_save_path, "w") as f:
             json.dump(results, f)
diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
index 01a5784757..cc075ec270 100644
--- a/mteb/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks.py
@@ -63,6 +63,7 @@ def load_results(
         base_results = load_results()
         return base_results.select_tasks(self.tasks)
 
+
 MTEB_MAIN_EN = Benchmark(
     name="MTEB(eng)",
     tasks=get_tasks(
diff --git a/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py b/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py
index 097b1d6a5b..f17dad9872 100644
--- a/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py
+++ b/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py
@@ -26,6 +26,19 @@ def __call__(
             return self.retriever.search_cross_encoder(
                 corpus, queries, self.top_k, instructions=instructions, **kwargs
             )
+        elif (
+            hasattr(self.retriever.model, "mteb_model_meta")
+            and self.retriever.model.mteb_model_meta.name == "bm25s"
+        ):
+            return self.retriever.model.search(
+                corpus,
+                queries,
+                self.top_k,
+                self.score_function,
+                task_name=self.task_name,  # type: ignore
+                instructions=instructions,
+                **kwargs,
+            )
         else:
             return self.retriever.search(
                 corpus,
diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py
index 470d824384..54e2e0acd8 100644
--- a/mteb/evaluation/evaluators/RetrievalEvaluator.py
+++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -257,11 +257,18 @@ def search_cross_encoder(
     ) -> dict[str, dict[str, float]]:
         """This function provides support for reranker (or cross-encoder) models that encode the query and document at the same time (typically with attention). Some notable examples include MonoBERT, MonoT5, RankLlama, etc.
-        Note: you must provide the path to the results to rerank to the __init__ function as `previous_results`
+        Note: you must provide the path to the results to rerank to the __init__ function as `previous_results`; otherwise, all documents in the corpus will be reranked.
         """
         pairs = []  # create the pairs for reranking
         for qid in queries.keys():
-            q_results = self.previous_results[qid]
+            if self.previous_results is None:
+                # no previous results provided; fall back to reranking the entire corpus
+                logging.info(
+                    f"previous_results is None. Using all {len(corpus)} documents in the corpus to rerank"
+                )
+                q_results = {doc_id: 0.0 for doc_id in corpus.keys()}
+            else:
+                q_results = self.previous_results[qid]
             # take the top-k only
             q_results_sorted = dict(
                 sorted(q_results.items(), key=lambda item: item[1], reverse=True)
             )
diff --git a/mteb/tasks/InstructionRetrieval/__init__.py b/mteb/tasks/InstructionRetrieval/__init__.py
index f032908014..f5e812247d 100644
--- a/mteb/tasks/InstructionRetrieval/__init__.py
+++ b/mteb/tasks/InstructionRetrieval/__init__.py
@@ -3,3 +3,4 @@
 from .eng.Core17InstructionRetrieval import *
 from .eng.News21InstructionRetrieval import *
 from .eng.Robust04InstructionRetrieval import *
+from .multilingual.mFollowIR import *
diff --git a/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py b/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py
index 167d623a69..9b52f282b2 100644
--- a/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py
+++ b/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py
@@ -8,7 +8,7 @@ class Core17InstructionRetrieval(AbsTaskInstructionRetrieval):
     metadata = TaskMetadata(
         name="Core17InstructionRetrieval",
-        description="Measuring retrieval instruction following ability on Core17 narratives.",
+        description="Measuring retrieval instruction following ability on Core17 narratives for the FollowIR benchmark.",
         reference="https://arxiv.org/abs/2403.15246",
         dataset={
             "path": "jhu-clsp/core17-instructions",
diff --git a/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py b/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py
index 4fbf8cda85..d693091279 100644
--- a/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py
+++ b/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py
@@ -8,7 +8,7 @@ class News21InstructionRetrieval(AbsTaskInstructionRetrieval):
     metadata = TaskMetadata(
         name="News21InstructionRetrieval",
-        description="Measuring retrieval instruction following ability on News21 narratives.",
+        description="Measuring retrieval instruction following ability on News21 narratives for the FollowIR benchmark.",
         reference="https://arxiv.org/abs/2403.15246",
         dataset={
             "path": "jhu-clsp/news21-instructions",
diff --git a/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py b/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py
index e09bb4593f..c68dfabc18 100644
--- a/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py
+++ b/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py
@@ -8,7 +8,7 @@ class Robust04InstructionRetrieval(AbsTaskInstructionRetrieval):
     metadata = TaskMetadata(
         name="Robust04InstructionRetrieval",
-        description="Measuring retrieval instruction following ability on Robust04 narratives.",
+        description="Measuring retrieval instruction following ability on Robust04 narratives for the FollowIR benchmark.",
         reference="https://arxiv.org/abs/2403.15246",
         dataset={
             "path": "jhu-clsp/robust04-instructions",
diff --git a/mteb/tasks/InstructionRetrieval/multilingual/__init__.py b/mteb/tasks/InstructionRetrieval/multilingual/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py b/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py
new file mode 100644
index 0000000000..04c1a56e19
--- /dev/null
+++ b/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py
@@ -0,0 +1,368 @@
+from __future__ import annotations
+
+from collections import defaultdict
+
+import datasets
+
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+from ....abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval
+
+_LANGUAGES = {
+    "fas": ["fas-Arab"],
+    "rus": ["rus-Cyrl"],
+    "zho": ["zho-Hans"],
+}
+
+_LANGUAGES_CLIR = {
+    "eng.fas": ["eng-Latn", "fas-Arab"],
+    "eng.rus": ["eng-Latn", "rus-Cyrl"],
+    "eng.zho": ["eng-Latn", "zho-Hans"],
+}
+
+
+def _build_lang_pair(langs: list[str]) -> str:
+    """Builds a language pair separated by a dash.
+    e.g., ['eng-Latn', 'deu-Latn'] -> 'eng-deu'.
+    """
+    return langs[0].split("-")[0] + "-" + langs[1].split("-")[0]
+
+
+def extend_lang_pairs() -> dict[str, list[str]]:
+    eval_langs = {}
+    for langs in _LANGUAGES_CLIR.values():
+        lang_pair = _build_lang_pair(langs)
+        eval_langs[lang_pair] = langs
+    return eval_langs
+
+
+_CLIR_LANGS = extend_lang_pairs()
+
+EVAL_SPLIT = "test"
+
+
+def load_data(
+    path: str,
+    langs: list,
+    eval_splits: list,
+    cache_dir: str | None = None,
+    revision: str | None = None,
+):
+    corpus = {lang: {EVAL_SPLIT: {}} for lang in langs}
+    queries = {lang: {EVAL_SPLIT: {}} for lang in langs}
+    og_relevant_docs = {lang: {EVAL_SPLIT: {}} for lang in langs}
+    changed_relevant_docs = {lang: {EVAL_SPLIT: {}} for lang in langs}
+    top_ranked = {lang: {EVAL_SPLIT: {}} for lang in langs}
+
+    for lang in langs:
+        if "-" in lang:
+            loading_lang = lang.split("-")[1]  # don't care about the eng part
+        else:
+            loading_lang = lang
+
+        # Load corpus data
+        corpus_data = datasets.load_dataset(
+            path,
+            f"corpus-{loading_lang}",
+            cache_dir=cache_dir,
+            revision=revision,
+            trust_remote_code=True,
+        )
+        corpus[lang][EVAL_SPLIT] = {
+            row["_id"]: {"title": row["title"], "text": row["text"]}
+            for row in corpus_data["corpus"]
+        }
+
+        # Load queries data
+        queries_data = datasets.load_dataset(
+            path,
+            f"queries-{loading_lang}",
+            cache_dir=cache_dir,
+            revision=revision,
+            trust_remote_code=True,
+        )
+        queries[lang][EVAL_SPLIT] = {
+            row["_id"]: {
+                "text": row["text"],
+                "instruction_og": row["instruction_og"],
+                "instruction_changed": row["instruction_changed"],
+                "keywords": row["keywords"] if "keywords" in row else None,
+                "short_query": row["short_query"] if "short_query" in row else None,
+            }
+            for row in queries_data["queries"]
+        }
+
+        # Load qrels_og data
+        qrels_og_data = datasets.load_dataset(
+            path,
+            f"qrels_og-{loading_lang}",
+            cache_dir=cache_dir,
+            revision=revision,
+            trust_remote_code=True,
+        )
+        for row in qrels_og_data[EVAL_SPLIT]:
+            if row["query-id"] not in og_relevant_docs[lang][EVAL_SPLIT]:
+                og_relevant_docs[lang][EVAL_SPLIT][row["query-id"]] = {
+                    row["corpus-id"]: int(row["score"])
+                }
+            else:
+                og_relevant_docs[lang][EVAL_SPLIT][row["query-id"]][
+                    row["corpus-id"]
+                ] = int(row["score"])
+
+        # Load qrels_changed data
+        qrels_changed_data = datasets.load_dataset(
+            path,
+            f"qrels_changed-{loading_lang}",
+            cache_dir=cache_dir,
+            revision=revision,
+            trust_remote_code=True,
+        )
+        for row in qrels_changed_data[EVAL_SPLIT]:
+            if row["query-id"] not in changed_relevant_docs[lang][EVAL_SPLIT]:
+                changed_relevant_docs[lang][EVAL_SPLIT][row["query-id"]] = {
+                    row["corpus-id"]: int(row["score"])
+                }
+            else:
+                changed_relevant_docs[lang][EVAL_SPLIT][row["query-id"]][
+                    row["corpus-id"]
+                ] = int(row["score"])
+
+        # Load top_ranked data
+        top_ranked_data = datasets.load_dataset(
+            path,
+            f"top_ranked-{loading_lang}",
+            cache_dir=cache_dir,
+            revision=revision,
+            trust_remote_code=True,
+        )
+        for row in top_ranked_data["top_ranked"]:
+            if row["qid"] not in top_ranked[lang][EVAL_SPLIT]:
+                top_ranked[lang][EVAL_SPLIT][row["qid"]] = [row["pid"]]
+            else:
+                top_ranked[lang][EVAL_SPLIT][row["qid"]].append(row["pid"])
+
+    # build og_instructions and changed_instructions from queries, then reduce queries to just their text
+    og_instructions = {lang: {EVAL_SPLIT: defaultdict(dict)} for lang in queries}
+    changed_instructions = {lang: {EVAL_SPLIT: defaultdict(dict)} for lang in queries}
+    queries_only = {lang: {EVAL_SPLIT: {}} for lang in queries}
+    for lang in queries:
+        for split in queries[lang]:
+            for qid in queries[lang][split]:
+                text = queries[lang][split][qid]["text"]
+                og_instructions[lang][split][text] = queries[lang][split][qid][
+                    "instruction_og"
+                ]
+                changed_instructions[lang][split][text] = queries[lang][split][qid][
+                    "instruction_changed"
+                ]
+                queries_only[lang][split][qid] = text
+
+    queries = queries_only
+
+    return (
+        corpus,
+        queries,
+        og_instructions,
+        changed_instructions,
+        og_relevant_docs,
+        changed_relevant_docs,
+        top_ranked,
+    )
+
+
+class mFollowIRCrossLingual(MultilingualTask, AbsTaskInstructionRetrieval):
+    metadata = TaskMetadata(
+        name="mFollowIRCrossLingualInstructionRetrieval",
+        description="This task measures retrieval instruction following ability on NeuCLIR narratives for the mFollowIR benchmark in Farsi, Russian, and Chinese, with English queries/instructions.",
+        reference="https://neuclir.github.io/",
+        dataset={
+            "path": "jhu-clsp/mFollowIR-cross-lingual-parquet",
+            "revision": "7a82814a53229d3c8f18b2e18762a1a959dc5ff6",
+        },
+        type="Retrieval",
+        category="s2p",
+        modalities=["text"],
+        eval_splits=[EVAL_SPLIT],
+        eval_langs=_CLIR_LANGS,
+        main_score="p-MRR",
+        date=("2021-08-01", "2022-06-30"),
+        domains=["News", "Written"],
+        task_subtypes=[],
+        license="odc-by",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation="""@article{weller2024mfollowir,
+  title={{mFollowIR: a Multilingual Benchmark for Instruction Following in Retrieval}},
+  author={Weller, Orion and Chang, Benjamin and Yang, Eugene and Yarmohammadi, Mahsa and Barham, Sam and MacAvaney, Sean and Cohan, Arman and Soldaini, Luca and Van Durme, Benjamin and Lawrie, Dawn},
+  journal={arXiv preprint TODO},
+  year={2024}
+}""",
+        descriptive_stats={
+            "n_samples": {"eng-fas": 40 * 2, "eng-rus": 40 * 2, "eng-zho": 43 * 2},
+            "test": {
+                "num_docs": 121635,
+                "num_queries": 123,
+                "average_document_length": 2331.0777818884367,
+                "average_query_length": 81.8780487804878,
+                "average_instruction_length": 389.9512195121951,
+                "average_changed_instruction_length": 450.5528455284553,
+                "average_relevant_docs_per_query": 10.30952380952381,
+                "average_top_ranked_per_query": 1024.3902439024391,
+                "hf_subset_descriptive_stats": {
+                    "eng-fas": {
+                        "num_docs": 41189,
+                        "num_queries": 40,
+                        "average_document_length": 3145.4990895627475,
+                        "average_query_length": 80.075,
+                        "average_instruction_length": 396.875,
+                        "average_changed_instruction_length": 463.175,
+                        "average_relevant_docs_per_query": 10.465116279069768,
+                        "average_top_ranked_per_query": 1075,
+                    },
+                    "eng-rus": {
+                        "num_docs": 39326,
+                        "num_queries": 40,
+                        "average_document_length": 2784.0813456746173,
+                        "average_query_length": 81.875,
+                        "average_instruction_length": 371.125,
+                        "average_changed_instruction_length": 431.8,
+                        "average_relevant_docs_per_query": 9.775,
+                        "average_top_ranked_per_query": 1000,
+                    },
+                    "eng-zho": {
+                        "num_docs": 41120,
+                        "num_queries": 43,
+                        "average_document_length": 1082.0501215953307,
+                        "average_query_length": 83.55813953488372,
+                        "average_instruction_length": 401.0232558139535,
+                        "average_changed_instruction_length": 456.25581395348837,
+                        "average_relevant_docs_per_query": 10.651162790697674,
+                        "average_top_ranked_per_query": 1000,
+                    },
+                },
+            },
+        },
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        (
+            self.corpus,
+            self.queries,
+            self.og_instructions,
+            self.changed_instructions,
+            self.og_relevant_docs,
+            self.changed_relevant_docs,
+            self.top_ranked,
+        ) = load_data(
+            path=self.metadata_dict["dataset"]["path"],
+            langs=self.metadata.eval_langs,
+            eval_splits=self.metadata_dict["eval_splits"],
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata_dict["dataset"]["revision"],
+        )
+
+        self.data_loaded = True
+
+
+class mFollowIR(MultilingualTask, AbsTaskInstructionRetrieval):
+    metadata = TaskMetadata(
+        name="mFollowIRInstructionRetrieval",
+        description="This task measures retrieval instruction following ability on NeuCLIR narratives for the mFollowIR benchmark in Farsi, Russian, and Chinese.",
+        reference="https://neuclir.github.io/",
+        dataset={
+            "path": "jhu-clsp/mFollowIR-parquet",
+            "revision": "2c5cdcb438eff9de6412803768ac7304d4771cdc",
+        },
+        type="Retrieval",
+        category="s2p",
+        modalities=["text"],
+        eval_splits=[EVAL_SPLIT],
+        eval_langs=_LANGUAGES,
+        main_score="p-MRR",
+        date=("2021-08-01", "2022-06-30"),
+        domains=["News", "Written"],
+        task_subtypes=[],
+        license="odc-by",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation="""@article{weller2024mfollowir,
+  title={{mFollowIR: a Multilingual Benchmark for Instruction Following in Retrieval}},
+  author={Weller, Orion and Chang, Benjamin and Yang, Eugene and Yarmohammadi, Mahsa and Barham, Sam and MacAvaney, Sean and Cohan, Arman and Soldaini, Luca and Van Durme, Benjamin and Lawrie, Dawn},
+  journal={arXiv preprint TODO},
+  year={2024}
+}""",
+        descriptive_stats={
+            "n_samples": {"fas": 40 * 2, "rus": 40 * 2, "zho": 43 * 2},
+            "test": {
+                "num_docs": 121635,
+                "num_queries": 123,
+                "average_document_length": 2331.0777818884367,
+                "average_query_length": 57.113821138211385,
+                "average_instruction_length": 281.0650406504065,
+                "average_changed_instruction_length": 326.9430894308943,
+                "average_relevant_docs_per_query": 10.30952380952381,
+                "average_top_ranked_per_query": 1024.3902439024391,
+                "hf_subset_descriptive_stats": {
+                    "fas": {
+                        "num_docs": 41189,
+                        "num_queries": 40,
+                        "average_document_length": 3145.4990895627475,
+                        "average_query_length": 72.65,
+                        "average_instruction_length": 358.925,
+                        "average_changed_instruction_length": 415.325,
+                        "average_relevant_docs_per_query": 10.465116279069768,
+                        "average_top_ranked_per_query": 1075,
+                    },
+                    "rus": {
+                        "num_docs": 39326,
+                        "num_queries": 40,
+                        "average_document_length": 2784.0813456746173,
+                        "average_query_length": 77.5,
+                        "average_instruction_length": 387,
+                        "average_changed_instruction_length": 458,
+                        "average_relevant_docs_per_query": 9.775,
+                        "average_top_ranked_per_query": 1000,
+                    },
+                    "zho": {
+                        "num_docs": 41120,
+                        "num_queries": 43,
+                        "average_document_length": 1082.0501215953307,
+                        "average_query_length": 23.697674418604652,
+                        "average_instruction_length": 110.09302325581395,
+                        "average_changed_instruction_length": 122.81395348837209,
+                        "average_relevant_docs_per_query": 10.651162790697674,
+                        "average_top_ranked_per_query": 1000,
+                    },
+                },
+            },
+        },
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        (
+            self.corpus,
+            self.queries,
+            self.og_instructions,
+            self.changed_instructions,
+            self.og_relevant_docs,
+            self.changed_relevant_docs,
+            self.top_ranked,
+        ) = load_data(
+            path=self.metadata_dict["dataset"]["path"],
+            langs=self.metadata.eval_langs,
+            eval_splits=self.metadata_dict["eval_splits"],
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata_dict["dataset"]["revision"],
+        )
+
+        self.data_loaded = True