Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions open_instruct/ground_truth_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,15 @@ def build_all_verifiers(args) -> Dict[str, VerifierFunction]:
for judge_type in JUDGE_PROMPT_MAP.keys():
instance = LMJudgeVerifier(judge_type, LMJudgeVerifierConfig.from_args(args))
verifiers[instance.name.lower()] = instance

# If a remap argument was provided, apply the remapping.
if args.remap_verifier:
remap = args.remap_verifier.split("=")
assert len(remap) == 2, "Remap must be in the format old_name=new_name"
old_name, new_name = remap
# map so that the old name calls the new verifier
assert new_name.lower() in verifiers, f"{new_name} not found in verifiers during remapping"
verifiers[old_name.lower()] = verifiers[new_name.lower()]

return verifiers

Expand Down
2 changes: 2 additions & 0 deletions open_instruct/grpo_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ class Args:
"""whether to apply verifiable reward"""
verification_reward: float = 10.0
"""the reward value for verifiable responses"""
remap_verifier: str = None
"""Remap verifier like string_f1=general-quality_ref. Currently can only remap once."""

# -- llm verifiers
llm_judge_model: str = "azure/gpt-4o-mini-standard"
Expand Down
2 changes: 2 additions & 0 deletions open_instruct/grpo_vllm_thread_ray_gtrl.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ class Args:
"""whether to add the R1 style format reward"""
r1_style_format_reward: float = 1.0
"""the reward value for R1 style format reward"""
remap_verifier: str = None
"""Remap verifier like string_f1=general-quality_ref. Currently can only remap once."""

# async setting
async_mode: bool = True
Expand Down
22 changes: 19 additions & 3 deletions open_instruct/judge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,27 @@ def extract_json_score_with_fallback(score_str: str) -> "tuple[str, float]":
if cleaned_str.endswith("```"):
cleaned_str = cleaned_str[:-3] # Remove trailing ```

# normalize CRLF to LF, then escape newlines
cleaned_str = cleaned_str.replace("\r\n", "\n").replace("\n", "\\n")
# escape backslashes that do not begin a valid JSON escape sequence
cleaned_str = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', cleaned_str)

cleaned_str = cleaned_str.strip()

data = json.loads(cleaned_str)
reasoning = data.get("REASONING", "")
return reasoning, float(data.get("SCORE", 0.0))
try:
data = json.loads(cleaned_str)
reasoning = data.get("REASONING", "")
score = float(data.get("SCORE", 0.0))
except json.JSONDecodeError:
# JSON decoding failed: fall back to extracting just the score with a regex
score_match = re.search(r'"SCORE"\s*:\s*"?([0-9]+(?:\.[0-9]+)?)"?', cleaned_str)
if score_match:
score = float(score_match.group(1))
reasoning = cleaned_str
else:
# no score found: raise so the outer handler logs a warning and returns 0.0
raise ValueError()
return reasoning, score
except (json.JSONDecodeError, TypeError, ValueError):
logger.warning(f"Could not parse score from due to invalid json: {score_str}, defaulting to 0.0")
return score_str, 0.0
Expand Down
3 changes: 2 additions & 1 deletion open_instruct/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,9 @@ async def apply_verifiable_reward(
tokenized_prediction=tok_prediction, prediction=prediction, label=gt, query=query
)
async_tasks.append(task)
# use reward_func.name to get the name of the verifier, rather than ds in case we have done remapping.
task_metadata.append(
{"response_idx": i, "dataset": ds, "reward_weight": reward_func.weight, "reward_mult": reward_mult}
{"response_idx": i, "dataset": reward_func.name, "reward_weight": reward_func.weight, "reward_mult": reward_mult}
)

# Execute all tasks in parallel
Expand Down
2 changes: 2 additions & 0 deletions open_instruct/ppo_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,8 @@ class Args:
"""whether to apply verifiable reward"""
verification_reward: float = 10.0
"""the reward value for verifiable responses"""
remap_verifier: str = None
"""Remap verifier like string_f1=general-quality_ref. Currently can only remap once."""

# -- llm verifiers reward
llm_judge_model: str = "azure/gpt-4o-mini-standard"
Expand Down
2 changes: 2 additions & 0 deletions open_instruct/ppo_vllm_thread_ray_gtrl.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,8 @@ class Args:
"""whether to add the R1 style format reward"""
r1_style_format_reward: float = 1.0
"""the reward value for R1 style format reward"""
remap_verifier: str = None
"""Remap verifier like string_f1=general-quality_ref. Currently can only remap once."""

# async setting
async_mode: bool = True
Expand Down