Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions dockerfiles/Dockerfile.nemo-skills
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,6 @@ RUN git clone https://github.com/google-research/google-research.git /opt/benchm
# ifbench
RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1
RUN cd /opt/benchmarks/IFBench && sed -i '/^unicodedata[=<>]*.*$/d' requirements.txt && pip install -r requirements.txt
RUN cd /opt/benchmarks

RUN pip install langdetect absl-py immutabledict nltk ipython && \
python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab')"

RUN git clone https://github.com/ShishirPatil/gorilla.git /opt/gorilla
RUN cd /opt/gorilla && git checkout d2177992bbba9aa228b53c0645bf8f5613a5a7c6
Expand All @@ -49,3 +45,10 @@ COPY pyproject.toml README.md /opt/NeMo-Skills/
COPY nemo_skills /opt/NeMo-Skills/nemo_skills/
COPY requirements /opt/NeMo-Skills/requirements/
RUN cd /opt/NeMo-Skills && pip install -e .

# patching a bug in ifbench
COPY dockerfiles/ifbench.patch /opt/benchmarks/IFBench/ifbench.patch
RUN cd /opt/benchmarks/IFBench && git apply ifbench.patch

RUN pip install langdetect absl-py immutabledict nltk ipython && \
python -c "import nltk; from spacy.cli import download; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords'); download('en_core_web_sm')"
40 changes: 40 additions & 0 deletions dockerfiles/ifbench.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
diff --git a/instructions.py b/instructions.py
index 0195010..d57b7d1 100644
--- a/instructions.py
+++ b/instructions.py
@@ -31,7 +31,9 @@ import io

import instructions_util

-download('en_core_web_sm')
+# assumed to be predownloaded
+print("skipping download of en_core_web_sm")
+# download('en_core_web_sm')

logger = logging.getLogger(__name__)

@@ -866,12 +868,17 @@ class EmojiSentenceChecker(Instruction):
sentences = instructions_util.split_into_sentences(value)
for i, sentence in enumerate(sentences):
stripped = sentence.translate(str.maketrans('', '', string.punctuation)).strip()
+ if not stripped:
+ return False
last_char = stripped[-1]
# because blank spaces are treated oddly
second_last_char = stripped[-2] if len(stripped) > 1 else stripped[-1]
if not emoji.is_emoji(last_char) and not emoji.is_emoji(second_last_char):
if i < len(sentences) - 1:
stripped = sentences[i + 1].translate(str.maketrans('', '', string.punctuation)).strip()
+ # fixed empty string
+ if not stripped:
+ return False
first_char = stripped[0]
if not emoji.is_emoji(first_char):
return False
@@ -2246,4 +2253,4 @@ class NoWhitespaceChecker(Instruction):
True if the response contains no whitespace;
otherwise, False.
"""
- return not any(char.isspace() for char in value)
\ No newline at end of file
+ return not any(char.isspace() for char in value)
16 changes: 9 additions & 7 deletions nemo_skills/evaluation/evaluator/ifbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import json
import logging
import shutil
import subprocess
from pathlib import Path

Expand All @@ -24,24 +25,26 @@

def eval_ifbench(cfg):
for jsonl_file in unroll_files(cfg.input_files):
parent_dir = Path(jsonl_file).absolute().parent
jsonl_path = Path(jsonl_file).resolve()
output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp"
output_dir.mkdir(parents=True, exist_ok=True)
cmd = (
"cd /opt/benchmarks/IFBench && python -m run_eval "
f"--input_data={jsonl_file} "
f"--input_response_data={jsonl_file} "
f"--output_dir={parent_dir} "
f"--output_dir={output_dir} "
)
subprocess.run(cmd, shell=True, check=True)
# fusing eval metrics back into the generation file
with open(jsonl_file, "rt", encoding="utf-8") as f:
samples = [json.loads(line) for line in f]

with open(parent_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f:
with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f:
eval_results = [json.loads(line) for line in f]
for sample, eval_result in zip(samples, eval_results):
sample["loose_eval"] = eval_result

with open(parent_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f:
with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f:
eval_results = [json.loads(line) for line in f]
for sample, eval_result in zip(samples, eval_results):
sample["strict_eval"] = eval_result
Expand All @@ -50,6 +53,5 @@ def eval_ifbench(cfg):
for sample in samples:
f.write(json.dumps(sample) + "\n")

# removing metric files to avoid reusing them
(parent_dir / "eval_results_loose.jsonl").unlink()
(parent_dir / "eval_results_strict.jsonl").unlink()
# removing temporary metric directory to avoid reusing it
shutil.rmtree(output_dir)
16 changes: 9 additions & 7 deletions nemo_skills/evaluation/evaluator/ifeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import json
import logging
import shutil
import subprocess
from pathlib import Path

Expand All @@ -24,24 +25,26 @@

def eval_if(cfg):
for jsonl_file in unroll_files(cfg.input_files):
parent_dir = Path(jsonl_file).absolute().parent
jsonl_path = Path(jsonl_file).resolve()
output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp"
output_dir.mkdir(parents=True, exist_ok=True)
cmd = (
"cd /opt/benchmarks/google-research && python -m instruction_following_eval.evaluation_main "
f"--input_data={jsonl_file} "
f"--input_response_data={jsonl_file} "
f"--output_dir={parent_dir} "
f"--output_dir={output_dir} "
)
subprocess.run(cmd, shell=True, check=True)
# fusing eval metrics back into the generation file
with open(jsonl_file, "rt", encoding="utf-8") as f:
samples = [json.loads(line) for line in f]

with open(parent_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f:
with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f:
eval_results = [json.loads(line) for line in f]
for sample, eval_result in zip(samples, eval_results):
sample["loose_eval"] = eval_result

with open(parent_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f:
with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f:
eval_results = [json.loads(line) for line in f]
for sample, eval_result in zip(samples, eval_results):
sample["strict_eval"] = eval_result
Expand All @@ -50,6 +53,5 @@ def eval_if(cfg):
for sample in samples:
f.write(json.dumps(sample) + "\n")

# removing metric files to avoid reusing them
(parent_dir / "eval_results_loose.jsonl").unlink()
(parent_dir / "eval_results_strict.jsonl").unlink()
# removing temporary metric directory to avoid reusing it
shutil.rmtree(output_dir)