run_strategyqa_llama2.py

import os
import re
import random
import time
import json
import torch
import wikienv, wrappers
from tqdm import tqdm
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
from utils.prompter import Prompter
from peft import PeftModel
from uncertainty_utils import *


base_model = "meta-llama/Llama-2-70b-hf"
load_in_4bit = False # set False to use 8-bit
mode = "uala" # choose from [standard, cot, react, uala]
oracle = True # whether to use oracle in uala
save_file_name = "outputs/llama2-strategyqa-dev-uala-oracle.jsonl" # saved file name

# load pre-calculated uncertainty threshold based on calibration set
uncertainty_threshold = 0.57


tokenizer = LlamaTokenizer.from_pretrained(base_model)
prompter = Prompter("llama2")
if load_in_4bit:
    bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
    )
    model = LlamaForCausalLM.from_pretrained(
            base_model,
            quantization_config=bnb_config,
            device_map="auto",
        )
else:
    model = LlamaForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
        )

model.eval()

def llama2_prompt(
    instruction,
    input=None,
    temperature=0,
    top_p=1,
    num_beams=1,
    do_sample=False,
    max_new_tokens=128,
    return_probs=False,
    **kwargs,
):
    prompt = prompter.generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to("cuda")

    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        num_beams=num_beams,
        do_sample=do_sample,
        **kwargs,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    input_length = inputs.input_ids.shape[1]
    generated_tokens = generation_output.sequences[:, input_length:]
    output = tokenizer.decode(generated_tokens[0])

    if return_probs:
        transition_scores = model.compute_transition_scores(
            generation_output.sequences, generation_output.scores, normalize_logits=True
        )
        prob_dicts = []
        for tok, score in zip(generated_tokens[0], transition_scores[0]):
            prob_dicts.append({tokenizer.decode(tok):score.cpu().tolist()})

        return output, prob_dicts

    else:
        return output

env = wikienv.WikiEnv()
env = wrappers.StrategyQAWrapper(env, split="dev")
env = wrappers.LoggingWrapper(env)

def step(env, action):
    attempts = 0
    while attempts < 10:
        try:
            return env.step(action)
        except requests.exceptions.Timeout:
            attempts += 1

prompt_file = './prompts/prompts.json'
with open(prompt_file, 'r') as f:
    prompt_dict = json.load(f)

# standard prompt
sqa_standard_examples = prompt_dict['sqa_standard']
instruction_standard = """Answer the question:\n"""

# cot prompt
sqa_cot_examples = prompt_dict['sqa_cot']
instruction_cot = """Solve a question answering task. Your task is to generate Thought and Answer where a Thought can reason about the current situation by thinking step by step.
Here are some examples.
"""

# react prompt
sqa_react_examples = prompt_dict['sqa_react']
instruction_react = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: 
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.
"""

def standard(idx=None, instruction=instruction_standard, prompt=sqa_standard_examples, to_print=True):
    question = env.reset(idx=idx)
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    answer, probs = llama2_prompt(instruction, prompt + "Answer:", return_probs=True)
    answer = answer.split("\n")[0].strip()
    if to_print:
        print("Answer:", answer)

    token_probs = []
    for d in probs:
        d = {k.replace('<0x0A>', '\n'): v for k, v in d.items()}
        key = [key for key in d.keys()][0]
        if key in answer:
            token_probs.append({key:d[key]})
        else:
            break
    return answer, token_probs

def cot(idx=None, instruction=instruction_cot, prompt=sqa_cot_examples, to_print=True):
    question = env.reset(idx=idx)
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    answer, probs = llama2_prompt(instruction, prompt + "Thought:", return_probs=True)
    answer = answer.split("\nQuestion:")[0].strip()
    if to_print:
        print("Thought:", answer)

    token_probs = []
    for d in probs:
        d = {k.replace('<0x0A>', '\n'): v for k, v in d.items()}
        key = [key for key in d.keys()][0]
        if key in answer:
            token_probs.append({key:d[key]})
        else:
            break
    return answer, token_probs

def react(idx=None, instruction=instruction_react, prompt=sqa_react_examples, to_print=True):
    question = env.reset(idx=idx)
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    n_calls, n_badcalls = 0, 0
    react_probs = []
    for i in range(1, 8):
        n_calls += 1
        thought_action, thought_action_probs = llama2_prompt(instruction, prompt + f"Thought {i}:", return_probs=True)
        react_probs.append(thought_action_probs)
        try:
            thought = thought_action.strip().split(f"\nAction {i}: ")[0]
            action = thought_action.strip().split(f"\nAction {i}: ")[1].split("\n")[0]
        except:
            print('ohh...', thought_action)
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action, action_probs= llama2_prompt(instruction, prompt + f"Thought {i}: {thought}\nAction {i}:", return_probs=True)
            action = action.split("\n")[0].strip()
            react_probs.append(action_probs)
        obs, r, done, info = step(env, action[0].lower() + action[1:])
        obs = obs.replace('\\n', '')

        step_str = f"Thought {i}: {thought}\nAction {i}: {action}\nObservation {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        obs, r, done, info = step(env, "finish[]")

    if to_print:
        print(info, '\n')
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return info, react_probs


evals = []
old_time = time.time()

num_tool_call_instance = 0
num_instance = 0
num_correct = 0
num_tool_calls = 0
num_backoff = 0
num_ask_human = 0

with open(save_file_name,"a") as output_file:
    for i in tqdm(range(len(env))):
        question = env.reset(idx=i)
        gold_answer = env.data[i][1]
        num_instance += 1

        if mode == "standard":
            predicted_answer, _ = standard(i, to_print=True)
            print('-----------')
            em = (wrappers.clean_answer(predicted_answer) == gold_answer)
            standard_final_output = {"answer": predicted_answer, "gt_answer": gold_answer, "question_idx": i, "em": em, "traj": question + '\nAnswer:' + predicted_answer}
            if standard_final_output["em"]:
                num_correct += 1
            output_file.write(json.dumps(standard_final_output, ensure_ascii=False) + '\n')

        elif mode == "cot":
            cot_output, _ = cot(i, to_print=True)
            print('-----------')
            try:
                predicted_answer = cot_output.split('Answer:')[1].strip()
            except:
                try:
                    predicted_answer = cot_output.split("the answer is ")[1].strip()
                except:
                    predicted_answer = ""
            em = (wrappers.clean_answer(predicted_answer) == gold_answer)
            cot_final_output = {"answer": predicted_answer, "gt_answer": gold_answer, "question_idx": i, "em": em, "traj": question + '\nThought:' + cot_output}
            if cot_final_output["em"]:
                num_correct += 1
            output_file.write(json.dumps(cot_final_output, ensure_ascii=False) + '\n')

        elif mode == "react":
            info, _ = react(i, to_print=True)
            evals.append(info['em'])
            print(sum(evals), len(evals), sum(evals) / len(evals), (time.time() - old_time) / len(evals))
            print('-----------')
            info["traj"] = info["traj"].split(sqa_react_examples)[1].strip()
            num_tool_calls += info["n_calls"]
            if info["em"]:
                num_correct += 1
            output_file.write(json.dumps(info, ensure_ascii=False) + '\n')

        elif mode == "uala":
            print('-----------Standard-----------')
            predicted_answer, probs = standard(i, to_print=True)

            em = (wrappers.clean_answer(predicted_answer) == gold_answer)
            standard_final_output = {"steps": 2, "answer": predicted_answer, "gt_answer": gold_answer, "question_idx": i, "reward": em, "em": em, "n_calls": 0, "n_badcalls": 0, "traj": question + '\nThought 1:' + predicted_answer + f'\nAction 1: MeasureUncertainty [{predicted_answer}]'}
            
            # extract answer token logprobs for sqa
            answer_probs = []
            for d in probs:
                for value in d.values():
                    answer_probs.append(value)

            # calculate uncertainty
            uncertainty = cal_uncertainty_single_token(answer_probs)

            # make tool use
            if uncertainty > uncertainty_threshold:
                print(f'-----------Answer’s uncertainty {round(uncertainty,2)}, which falls outside the acceptable uncertainty threshold of {uncertainty_threshold}, I need to use an external tool to solve the question.-----------')
                num_tool_call_instance += 1
                print('-----------ReAct-----------')
                info, react_probs = react(i, to_print=True)
                predicted_react_answer = info["answer"]
                standard_final_output["traj"] += f"\nObservation 1: Answer’s uncertainty is {round(uncertainty,2)}, which falls outside the acceptable threshold of {uncertainty_threshold}.\nThought 2: Based on the uncertainty, I need to use an external tool to solve the question.\nAction 2: Activate Tool.\n"
                
                def reformat(text):
                    return f"{text.group(1)} {int(text.group(2)) + 2}"

                react_traj = info["traj"].split(sqa_react_examples + question + "\n")[1].strip()
                reformat_react_traj = re.compile(r'(Thought|Observation|Action) (\d+)').sub(reformat, react_traj)

                last_index = int(re.findall(r'(Thought|Observation|Action)\s*(\d+)', reformat_react_traj)[-1][-1])

                info["traj"] = standard_final_output["traj"] + reformat_react_traj 
                info["steps"] += standard_final_output["steps"]
                num_tool_calls += info["n_calls"]

                if info["answer"].lower() in ['yes','no']:
                    info["traj"] += f"\nThought {str(last_index+1)}: Let me check the uncertainty of the returned answer.\nAction {str(last_index+1)}: MeasureUncertainty [{predicted_react_answer}]"
                    info["steps"] += 1

                    # extract the answer token probs
                    answer_probs = []
                    for list in react_probs:
                        for key in list:
                            finish = False
                            if "Fin" in key:
                                answer_probs = []
                                idx = list.index(key)
                                for i in range(idx+2,len(list)):
                                    for key, value in list[i].items():
                                        if key == '[':
                                            continue
                                        if key == ']':
                                            finish = True
                                            break
                                        answer_probs.append(value)
                                    if finish:
                                        break
                    if not answer_probs:
                        answer_probs = [0.0]

                    react_uncertainty = cal_uncertainty_single_token(answer_probs)
                    if react_uncertainty > uncertainty_threshold:
                        info["steps"] += 1
                        if oracle:
                            print(f"-----------Answer’s uncertainty is {round(react_uncertainty,2)}, which falls outside the acceptable threshold of {uncertainty_threshold}, ask a human for help.-----------")
                            info["traj"] += f"\nObservation {str(last_index+2)}: Answer’s uncertainty is {round(react_uncertainty,2)}, which falls outside the acceptable threshold of {uncertainty_threshold}.\nThought {str(last_index+2)}: Based on the uncertainty, I need to ask a human for help.\nAction {str(last_index+2)}: Ask Human.\nObservation {str(last_index+2)}: {gold_answer}\nAnswer: {gold_answer}"
                            info["answer"] = gold_answer
                            info["reward"] = True
                            info["em"] = True
                            num_ask_human += 1
                        else:
                            print(f"-----------Answer’s uncertainty is {round(react_uncertainty,2)}, which falls outside the acceptable threshold of {uncertainty_threshold}, for simplicity, answer is still kept.-----------")
                            info["traj"] += f"\nObservation {str(last_index+2)}: Answer’s uncertainty is {round(react_uncertainty,2)}, which falls outside the acceptable threshold of {uncertainty_threshold}.\nThought {str(last_index+2)}: For simplicity, answer is still kept.\nAction {str(last_index+2)}: Keep Answer.\nAnswer: {predicted_react_answer}"
                            
                    else:
                        print(f"-----------Answer’s uncertainty is {round(react_uncertainty,2)}, which falls within the acceptable threshold of {uncertainty_threshold}, answer is kept.-----------")
                        info["traj"] += f"\nObservation {str(last_index+2)}: Answer’s uncertainty is {round(react_uncertainty,2)}, which falls within the acceptable threshold of {uncertainty_threshold}.\nThought {str(last_index+2)}: Based on the uncertainty, answer is kept.\nAction {str(last_index+2)}: Keep Answer.\nAnswer: {predicted_react_answer}"
                        info["steps"] += 1
                    
                else:
                    print("-----------Returned tool-use answer is invalid, I need to use the backoff answer.-----------")
                    use_backoff_traj = f"\nThought {str(last_index+1)}: Returned tool-use answer is invalid, I need to use the backoff answer.\nAction {str(last_index+1)}: Use Backoff Answer.\nAnswer: {predicted_answer}"
                    info["traj"] += use_backoff_traj
                    num_backoff += 1
                    info["answer"] = standard_final_output["answer"]
                    info["reward"] = standard_final_output["reward"]
                    info["em"] = standard_final_output["em"]

                if info["em"]:
                    num_correct += 1
                output_file.write(json.dumps(info, ensure_ascii=False) + '\n')
            else:
                print(f'-----------Answer’s uncertainty is {round(uncertainty,2)}, which falls within the acceptable threshold of {uncertainty_threshold}, answer is kept.-----------')
                standard_final_output["traj"] += f"\nObservation 1: Answer’s uncertainty is {round(uncertainty,2)}, which falls within the acceptable threshold of {uncertainty_threshold}.\nThought 2: Based on the uncertainty, answer is kept.\nAction 2: Keep Answer.\nAnswer: {predicted_answer}"
                if standard_final_output["em"]:
                    num_correct += 1
                output_file.write(json.dumps(standard_final_output, ensure_ascii=False) + '\n')