Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions dockerfiles/Dockerfile.nemo-skills
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,6 @@ RUN git clone https://github.com/google-research/google-research.git /opt/benchm
# ifbench
RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1
RUN cd /opt/benchmarks/IFBench && sed -i '/^unicodedata[=<>]*.*$/d' requirements.txt && pip install -r requirements.txt
RUN cd /opt/benchmarks

RUN pip install langdetect absl-py immutabledict nltk ipython && \
python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab')"

RUN git clone https://github.com/ShishirPatil/gorilla.git /opt/gorilla
RUN cd /opt/gorilla && git checkout d2177992bbba9aa228b53c0645bf8f5613a5a7c6
Expand All @@ -49,3 +45,10 @@ COPY pyproject.toml README.md /opt/NeMo-Skills/
COPY nemo_skills /opt/NeMo-Skills/nemo_skills/
COPY requirements /opt/NeMo-Skills/requirements/
RUN cd /opt/NeMo-Skills && pip install -e .

# patching a bug in ifbench
COPY dockerfiles/ifbench.patch /opt/benchmarks/IFBench/ifbench.patch
RUN cd /opt/benchmarks/IFBench && git apply ifbench.patch

RUN pip install langdetect absl-py immutabledict nltk ipython && \
python -c "import nltk; from spacy.cli import download; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords'); download('en_core_web_sm')"
40 changes: 40 additions & 0 deletions dockerfiles/ifbench.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
diff --git a/instructions.py b/instructions.py
index 0195010..d57b7d1 100644
--- a/instructions.py
+++ b/instructions.py
@@ -31,7 +31,9 @@ import io

import instructions_util

-download('en_core_web_sm')
+# assumed to be predownloaded
+print("skipping download of en_core_web_sm")
+# download('en_core_web_sm')

logger = logging.getLogger(__name__)

@@ -866,12 +868,17 @@ class EmojiSentenceChecker(Instruction):
sentences = instructions_util.split_into_sentences(value)
for i, sentence in enumerate(sentences):
stripped = sentence.translate(str.maketrans('', '', string.punctuation)).strip()
+ if not stripped:
+ return False
last_char = stripped[-1]
# because blank spaces are treated oddly
second_last_char = stripped[-2] if len(stripped) > 1 else stripped[-1]
if not emoji.is_emoji(last_char) and not emoji.is_emoji(second_last_char):
if i < len(sentences) - 1:
stripped = sentences[i + 1].translate(str.maketrans('', '', string.punctuation)).strip()
+ # fixed empty string
+ if not stripped:
+ return False
first_char = stripped[0]
if not emoji.is_emoji(first_char):
return False
@@ -2246,4 +2253,4 @@ class NoWhitespaceChecker(Instruction):
True if the response contains no whitespace;
otherwise, False.
"""
- return not any(char.isspace() for char in value)
\ No newline at end of file
+ return not any(char.isspace() for char in value)
16 changes: 9 additions & 7 deletions nemo_skills/evaluation/evaluator/ifbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import json
import logging
import shutil
import subprocess
from pathlib import Path

Expand All @@ -24,24 +25,26 @@

def eval_ifbench(cfg):
for jsonl_file in unroll_files(cfg.input_files):
parent_dir = Path(jsonl_file).absolute().parent
jsonl_path = Path(jsonl_file).resolve()
output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp"
output_dir.mkdir(parents=True, exist_ok=True)
cmd = (
"cd /opt/benchmarks/IFBench && python -m run_eval "
f"--input_data={jsonl_file} "
f"--input_response_data={jsonl_file} "
f"--output_dir={parent_dir} "
f"--output_dir={output_dir} "
)
subprocess.run(cmd, shell=True, check=True)
# fusing eval metrics back into the generation file
with open(jsonl_file, "rt", encoding="utf-8") as f:
samples = [json.loads(line) for line in f]

with open(parent_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f:
with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f:
eval_results = [json.loads(line) for line in f]
for sample, eval_result in zip(samples, eval_results):
sample["loose_eval"] = eval_result

with open(parent_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f:
with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f:
eval_results = [json.loads(line) for line in f]
for sample, eval_result in zip(samples, eval_results):
sample["strict_eval"] = eval_result
Expand All @@ -50,6 +53,5 @@ def eval_ifbench(cfg):
for sample in samples:
f.write(json.dumps(sample) + "\n")

# removing metric files to avoid reusing them
(parent_dir / "eval_results_loose.jsonl").unlink()
(parent_dir / "eval_results_strict.jsonl").unlink()
# removing temporary metric directory to avoid reusing it
shutil.rmtree(output_dir)
16 changes: 9 additions & 7 deletions nemo_skills/evaluation/evaluator/ifeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import json
import logging
import shutil
import subprocess
from pathlib import Path

Expand All @@ -24,24 +25,26 @@

def eval_if(cfg):
for jsonl_file in unroll_files(cfg.input_files):
parent_dir = Path(jsonl_file).absolute().parent
jsonl_path = Path(jsonl_file).resolve()
output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp"
output_dir.mkdir(parents=True, exist_ok=True)
cmd = (
"cd /opt/benchmarks/google-research && python -m instruction_following_eval.evaluation_main "
f"--input_data={jsonl_file} "
f"--input_response_data={jsonl_file} "
f"--output_dir={parent_dir} "
f"--output_dir={output_dir} "
)
subprocess.run(cmd, shell=True, check=True)
# fusing eval metrics back into the generation file
with open(jsonl_file, "rt", encoding="utf-8") as f:
samples = [json.loads(line) for line in f]

with open(parent_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f:
with open(output_dir / "eval_results_loose.jsonl", "rt", encoding="utf-8") as f:
eval_results = [json.loads(line) for line in f]
for sample, eval_result in zip(samples, eval_results):
sample["loose_eval"] = eval_result

with open(parent_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f:
with open(output_dir / "eval_results_strict.jsonl", "rt", encoding="utf-8") as f:
eval_results = [json.loads(line) for line in f]
for sample, eval_result in zip(samples, eval_results):
sample["strict_eval"] = eval_result
Expand All @@ -50,6 +53,5 @@ def eval_if(cfg):
for sample in samples:
f.write(json.dumps(sample) + "\n")

# removing metric files to avoid reusing them
(parent_dir / "eval_results_loose.jsonl").unlink()
(parent_dir / "eval_results_strict.jsonl").unlink()
# removing temporary metric directory to avoid reusing it
shutil.rmtree(output_dir)