Pin IFBench to a fixed commit and harden its evaluation against checker errors.

dockerfiles/Dockerfile.nemo-skills
  * Replace the floating `git clone --depth=1` with a shallow fetch of
    IFBENCH_COMMIT, parameterised through ARG IFBENCH_REPO / IFBENCH_DIR.
  * Install requirements from the quoted IFBENCH_DIR with --no-cache-dir,
    matching the quoting used by the preceding RUN step.

dockerfiles/ifbench.patch
  * evaluation_lib.py: wrap instruction.check_following() in try/except in both
    the strict and the loose evaluator. An exception is logged with the
    instruction id and prompt key, and that response is scored as "not
    following" instead of the exception propagating.
  * evaluation_lib.py (strict): the stored result is coerced with bool() so the
    list holds plain booleans, as it did before the try/except was introduced.
  * instructions.py: hunk offset moves from line 31 to line 30; the
    en_core_web_sm download stays disabled.

NOTE(review): the text this diff was recovered from had its line breaks and
indentation collapsed. The line structure below was rebuilt from the hunk
counts; the indentation inside the embedded Python hunks is assumed to be
4 spaces -- confirm against IFBench at IFBENCH_COMMIT before applying.
The `index` blob hashes are carried over unchanged and do not reflect the
reviewed lines.

diff --git a/dockerfiles/Dockerfile.nemo-skills b/dockerfiles/Dockerfile.nemo-skills
index bae7d5dd51..9918975bab 100644
--- a/dockerfiles/Dockerfile.nemo-skills
+++ b/dockerfiles/Dockerfile.nemo-skills
@@ -37,8 +37,12 @@ RUN cd /opt/gorilla/berkeley-function-call-leaderboard && pip install --no-cache
 RUN apt remove -y python3-blinker
 
 # ifbench
-RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1
-RUN cd /opt/benchmarks/IFBench && pip install -r requirements.txt
+ARG IFBENCH_COMMIT=c6767a19bd82ac0536cab950f2f8f6bcc6fabe7c
+ARG IFBENCH_REPO=https://github.com/allenai/IFBench.git
+ARG IFBENCH_DIR=/opt/benchmarks/IFBench
+RUN git init "$IFBENCH_DIR" && cd "$IFBENCH_DIR" && git remote add origin "$IFBENCH_REPO" && \
+    git fetch --depth 1 origin "${IFBENCH_COMMIT}" && git reset --hard FETCH_HEAD
+RUN cd "$IFBENCH_DIR" && pip install --no-cache-dir -r requirements.txt
 
 # removing on-the-fly installation in ifbench to avoid conflicts from parallel jobs
 COPY dockerfiles/ifbench.patch /opt/benchmarks/IFBench/ifbench.patch
diff --git a/dockerfiles/ifbench.patch b/dockerfiles/ifbench.patch
index 510f8a1c01..4aab580b16 100644
--- a/dockerfiles/ifbench.patch
+++ b/dockerfiles/ifbench.patch
@@ -1,15 +1,78 @@
+diff --git a/evaluation_lib.py b/evaluation_lib.py
+index a0db9e7..912a26e 100644
+--- a/evaluation_lib.py
++++ b/evaluation_lib.py
+@@ -18,6 +18,7 @@
+ import collections
+ import dataclasses
+ import json
++import logging
+ from typing import Dict, Optional, Union
+
+ import instructions_registry
+@@ -90,10 +91,19 @@ def test_instruction_following_strict(
+         if args and "prompt" in args:
+             instruction.build_description(prompt=inp.prompt)
+
+-        if response.strip() and instruction.check_following(response):
+-            is_following_list.append(True)
+-        else:
+-            is_following_list.append(False)
++
++        response_has_content = bool(response.strip())
++        follows_instruction = False
++        if response_has_content:
++            try:
++                follows_instruction = instruction.check_following(response)
++            except Exception:  # pylint: disable=broad-except
++                logging.exception(
++                    "check_following failed for instruction %s (prompt key %s)",
++                    instruction_id,
++                    inp.key,
++                )
++        is_following_list.append(bool(follows_instruction))
+
+     return OutputExample(
+         instruction_id_list=inp.instruction_id_list,
+@@ -142,9 +152,18 @@ def test_instruction_following_loose(
+
+         is_following = False
+         for r in all_responses:
+-            if r.strip() and instruction.check_following(r):
+-                is_following = True
+-                break
++            if not r.strip():
++                continue
++            try:
++                if instruction.check_following(r):
++                    is_following = True
++                    break
++            except Exception:  # pylint: disable=broad-except
++                logging.exception(
++                    "check_following failed for instruction %s (prompt key %s)",
++                    instruction_id,
++                    inp.key,
++                )
+
+         is_following_list.append(is_following)
+
+@@ -217,3 +236,4 @@ def print_report(outputs):
+     for instruction_id in sorted(tier1_total.keys()):
+         accuracy = tier1_correct[instruction_id] / tier1_total[instruction_id]
+         print(f"{instruction_id} {accuracy}")
++
 diff --git a/instructions.py b/instructions.py
-index d3071c7..e303de7 100644
+index f32ff48..e587c9e 100644
 --- a/instructions.py
 +++ b/instructions.py
-@@ -31,7 +31,9 @@ import io
- 
+@@ -30,7 +30,9 @@ import io
+
  import instructions_util
- 
+
 -download('en_core_web_sm')
 +# assumed to be predownloaded
 +print("skipping download of en_core_web_sm")
 +# download('en_core_web_sm')
- 
+
  logger = logging.getLogger(__name__)
- 
+