diff --git a/dockerfiles/Dockerfile.nemo-skills b/dockerfiles/Dockerfile.nemo-skills index fa0fcea631..56f83ec640 100644 --- a/dockerfiles/Dockerfile.nemo-skills +++ b/dockerfiles/Dockerfile.nemo-skills @@ -33,10 +33,6 @@ RUN git clone https://github.com/google-research/google-research.git /opt/benchm # ifbench RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1 RUN cd /opt/benchmarks/IFBench && sed -i '/^unicodedata[=<>]*.*$/d' requirements.txt && pip install -r requirements.txt -RUN cd /opt/benchmarks - -RUN pip install langdetect absl-py immutabledict nltk ipython && \ - python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab')" RUN git clone https://github.com/ShishirPatil/gorilla.git /opt/gorilla RUN cd /opt/gorilla && git checkout d2177992bbba9aa228b53c0645bf8f5613a5a7c6 @@ -49,3 +45,10 @@ COPY pyproject.toml README.md /opt/NeMo-Skills/ COPY nemo_skills /opt/NeMo-Skills/nemo_skills/ COPY requirements /opt/NeMo-Skills/requirements/ RUN cd /opt/NeMo-Skills && pip install -e . + +# patching a bug in ifbench +COPY dockerfiles/ifbench.patch /opt/benchmarks/IFBench/ifbench.patch +RUN cd /opt/benchmarks/IFBench && git apply ifbench.patch + +RUN pip install langdetect absl-py immutabledict nltk ipython && \ + python -c "import nltk; from spacy.cli import download; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords'); download('en_core_web_sm')" diff --git a/dockerfiles/ifbench.patch b/dockerfiles/ifbench.patch new file mode 100644 index 0000000000..54b685b064 --- /dev/null +++ b/dockerfiles/ifbench.patch @@ -0,0 +1,40 @@ +diff --git a/instructions.py b/instructions.py +index 0195010..d57b7d1 100644 +--- a/instructions.py ++++ b/instructions.py +@@ -31,7 +31,9 @@ import io + + import instructions_util + +-download('en_core_web_sm') ++# assumed to be predownloaded ++print("skipping download of en_core_web_sm") ++# download('en_core_web_sm') + + logger = logging.getLogger(__name__) + +@@ -866,12 +868,17 @@ class EmojiSentenceChecker(Instruction): + sentences = instructions_util.split_into_sentences(value) + for i, sentence in enumerate(sentences): + stripped = sentence.translate(str.maketrans('', '', string.punctuation)).strip() ++ if not stripped: ++ return False + last_char = stripped[-1] + # because blank spaces are treated oddly + second_last_char = stripped[-2] if len(stripped) > 1 else stripped[-1] + if not emoji.is_emoji(last_char) and not emoji.is_emoji(second_last_char): + if i < len(sentences) - 1: + stripped = sentences[i + 1].translate(str.maketrans('', '', string.punctuation)).strip() ++ # fixed empty string ++ if not stripped: ++ return False + first_char = stripped[0] + if not emoji.is_emoji(first_char): + return False +@@ -2246,4 +2253,4 @@ class NoWhitespaceChecker(Instruction): + True if the response contains no whitespace; + otherwise, False. + """ +- return not any(char.isspace() for char in value) +\ No newline at end of file ++ return not any(char.isspace() for char in value) diff --git a/nemo_skills/evaluation/evaluator/ifbench.py b/nemo_skills/evaluation/evaluator/ifbench.py index 310829d841..2b7e06ca0f 100644 --- a/nemo_skills/evaluation/evaluator/ifbench.py +++ b/nemo_skills/evaluation/evaluator/ifbench.py @@ -14,6 +14,7 @@ import json import logging +import shutil import subprocess from pathlib import Path @@ -24,24 +25,26 @@ def eval_ifbench(cfg): for jsonl_file in unroll_files(cfg.input_files): - parent_dir = Path(jsonl_file).absolute().parent + jsonl_path = Path(jsonl_file).resolve() + output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp" + output_dir.mkdir(parents=True, exist_ok=True) cmd = ( "cd /opt/benchmarks/IFBench && python -m run_eval " f"--input_data={jsonl_file} " f"--input_response_data={jsonl_file} " - f"--output_dir={parent_dir} " + f"--output_dir={output_dir} " ) subprocess.run(cmd, shell=True, check=True) # fusing eval metrics back into the generation file with open(jsonl_file, "rt", encoding="utf-8") as f: samples = [json.loads(line) for line in f] - with open(parent_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f: + with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f: eval_results = [json.loads(line) for line in f] for sample, eval_result in zip(samples, eval_results): sample["loose_eval"] = eval_result - with open(parent_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f: + with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f: eval_results = [json.loads(line) for line in f] for sample, eval_result in zip(samples, eval_results): sample["strict_eval"] = eval_result @@ -50,6 +53,5 @@ def eval_ifbench(cfg): for sample in samples: f.write(json.dumps(sample) + "\n") - # removing metric files to avoid reusing them - (parent_dir / "eval_results_loose.jsonl").unlink() - (parent_dir / "eval_results_strict.jsonl").unlink() + # removing temporary metric directory to avoid reusing it + shutil.rmtree(output_dir) diff --git a/nemo_skills/evaluation/evaluator/ifeval.py b/nemo_skills/evaluation/evaluator/ifeval.py index 1865626851..ecf4e1f1d0 100644 --- a/nemo_skills/evaluation/evaluator/ifeval.py +++ b/nemo_skills/evaluation/evaluator/ifeval.py @@ -14,6 +14,7 @@ import json import logging +import shutil import subprocess from pathlib import Path @@ -24,24 +25,26 @@ def eval_if(cfg): for jsonl_file in unroll_files(cfg.input_files): - parent_dir = Path(jsonl_file).absolute().parent + jsonl_path = Path(jsonl_file).resolve() + output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp" + output_dir.mkdir(parents=True, exist_ok=True) cmd = ( "cd /opt/benchmarks/google-research && python -m instruction_following_eval.evaluation_main " f"--input_data={jsonl_file} " f"--input_response_data={jsonl_file} " - f"--output_dir={parent_dir} " + f"--output_dir={output_dir} " ) subprocess.run(cmd, shell=True, check=True) # fusing eval metrics back into the generation file with open(jsonl_file, "rt", encoding="utf-8") as f: samples = [json.loads(line) for line in f] - with open(parent_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f: + with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f: eval_results = [json.loads(line) for line in f] for sample, eval_result in zip(samples, eval_results): sample["loose_eval"] = eval_result - with open(parent_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f: + with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f: eval_results = [json.loads(line) for line in f] for sample, eval_result in zip(samples, eval_results): sample["strict_eval"] = eval_result @@ -50,6 +53,5 @@ def eval_if(cfg): for sample in samples: f.write(json.dumps(sample) + "\n") - # removing metric files to avoid reusing them - (parent_dir / "eval_results_loose.jsonl").unlink() - (parent_dir / "eval_results_strict.jsonl").unlink() + # removing temporary metric directory to avoid reusing it + shutil.rmtree(output_dir)