Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions dockerfiles/Dockerfile.nemo-skills
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,12 @@ RUN cd /opt/gorilla/berkeley-function-call-leaderboard && pip install --no-cache
RUN apt remove -y python3-blinker

# ifbench
RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1
RUN cd /opt/benchmarks/IFBench && pip install -r requirements.txt
ARG IFBENCH_COMMIT=c6767a19bd82ac0536cab950f2f8f6bcc6fabe7c
ARG IFBENCH_REPO=https://github.com/allenai/IFBench.git
ARG IFBENCH_DIR=/opt/benchmarks/IFBench
RUN git init "$IFBENCH_DIR" && cd "$IFBENCH_DIR" && git remote add origin "$IFBENCH_REPO" && \
git fetch --depth 1 origin "${IFBENCH_COMMIT}" && git reset --hard FETCH_HEAD
RUN cd ${IFBENCH_DIR} && pip install -r requirements.txt

# removing on-the-fly installation in ifbench to avoid conflicts from parallel jobs
COPY dockerfiles/ifbench.patch /opt/benchmarks/IFBench/ifbench.patch
Expand Down
75 changes: 69 additions & 6 deletions dockerfiles/ifbench.patch
Original file line number Diff line number Diff line change
@@ -1,15 +1,78 @@
diff --git a/evaluation_lib.py b/evaluation_lib.py
index a0db9e7..912a26e 100644
--- a/evaluation_lib.py
+++ b/evaluation_lib.py
@@ -18,6 +18,7 @@
import collections
import dataclasses
import json
+import logging
from typing import Dict, Optional, Union

import instructions_registry
@@ -90,10 +91,19 @@ def test_instruction_following_strict(
if args and "prompt" in args:
instruction.build_description(prompt=inp.prompt)

- if response.strip() and instruction.check_following(response):
- is_following_list.append(True)
- else:
- is_following_list.append(False)
+
+ response_has_content = bool(response.strip())
+ follows_instruction = False
+ if response_has_content:
+ try:
+ follows_instruction = instruction.check_following(response)
+ except Exception: # pylint: disable=broad-except
+ logging.exception(
+ "check_following failed for instruction %s (prompt key %s)",
+ instruction_id,
+ inp.key,
+ )
+ is_following_list.append(response_has_content and follows_instruction)

return OutputExample(
instruction_id_list=inp.instruction_id_list,
@@ -142,9 +152,18 @@ def test_instruction_following_loose(

is_following = False
for r in all_responses:
- if r.strip() and instruction.check_following(r):
- is_following = True
- break
+ if not r.strip():
+ continue
+ try:
+ if instruction.check_following(r):
+ is_following = True
+ break
+ except Exception: # pylint: disable=broad-except
+ logging.exception(
+ "check_following failed for instruction %s (prompt key %s)",
+ instruction_id,
+ inp.key,
+ )

is_following_list.append(is_following)

@@ -217,3 +236,4 @@ def print_report(outputs):
for instruction_id in sorted(tier1_total.keys()):
accuracy = tier1_correct[instruction_id] / tier1_total[instruction_id]
print(f"{instruction_id} {accuracy}")
+
diff --git a/instructions.py b/instructions.py
index d3071c7..e303de7 100644
index f32ff48..e587c9e 100644
--- a/instructions.py
+++ b/instructions.py
@@ -31,7 +31,9 @@ import io

@@ -30,7 +30,9 @@ import io
import instructions_util

-download('en_core_web_sm')
+# assumed to be predownloaded
+print("skipping download of en_core_web_sm")
+# download('en_core_web_sm')

logger = logging.getLogger(__name__)