#67 Add siq2 loc evaluation pipeline
zhanwenchen committed Apr 11, 2024
1 parent 2a555f9 commit 02931f4
Showing 3 changed files with 308 additions and 0 deletions.
170 changes: 170 additions & 0 deletions quantitative_evaluation/evaluate_loc.py
@@ -0,0 +1,170 @@
import os
import argparse
import json
from multiprocessing.pool import Pool


def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to the file containing predictions.")
    parser.add_argument("--output_dir", required=True, help="The path to save per-sample annotation json files.")
    parser.add_argument("--output_json", required=True, help="The path to save the final combined annotation json file.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    args = parser.parse_args()
    return args


def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question-answer pairs by exact-match comparison between the
    ground-truth answer and the prediction.
    Writes a correctness score of 1 (match) or 0 (no match) per pair.
    """
    for file in caption_files:
        key = file[:-5]  # Strip the .json file extension
        qa_set = prediction_set[key]
        answer = qa_set['a']
        pred = qa_set['pred']

        # Compute the correctness score: exact match between answer and prediction.
        if answer == pred:
            response_dict = {'pred': 'yes', 'score': 1}
        else:
            response_dict = {'pred': 'no', 'score': 0}
        result_qa_pair = [response_dict, qa_set]

        # Save the question-answer pair and its score to a json file.
        with open(f"{output_dir}/{key}.json", "w") as f:
            json.dump(result_qa_pair, f)



def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    with open(args.pred_path) as file:
        pred_contents = json.load(file)

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        video_id = sample['id']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample whose key is made unique by the occurrence count
        new_sample = sample
        new_sample['id'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generate the list of ids and the corresponding per-sample output files
    id_list = [x['id'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Create the output directory if it does not exist.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Prepare the dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['id']
        question = sample['question']
        answer = sample['answer']
        pred = sample['pred']
        qa_set = {"q": question, "a": answer, "pred": pred}
        prediction_set[id] = qa_set

    num_tasks = args.num_tasks

    # Loop until every caption file has been processed.
    while True:
        try:
            # Files that have already been processed.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining files into one part per task.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json") and 'results' not in file_name and 'preds' not in file_name:
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score and accuracy
    score_sum = 0
    count = 0
    yes_count = 0
    no_count = 0
    for key, result in combined_contents.items():
        # Compute the score
        count += 1
        result_0 = result[0]
        score = int(result_0['score'])
        score_sum += score

        # Compute the accuracy
        pred_lower = result_0['pred'].lower()
        if "yes" in pred_lower:
            yes_count += 1
        elif "no" in pred_lower:
            no_count += 1
        else:
            raise ValueError(f'For key={key}, there is no yes or no in the answer for result={result}')

    average_score = score_sum / count
    accuracy = yes_count / (yes_count + no_count)
    print("Yes count:", yes_count)
    print("No count:", no_count)
    print("Accuracy:", accuracy)
    print("Average score:", average_score)


if __name__ == "__main__":
    main()
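For reference, evaluate_loc.py expects --pred_path to point to a JSON list in which every sample carries 'id', 'question', 'answer', and 'pred' keys, and it scores each sample by exact string match between 'answer' and 'pred'. Below is a minimal sketch of that input and a standalone invocation; the file names, ids, and answer strings are hypothetical placeholders, not part of this commit.

# Minimal sketch (hypothetical paths, ids, and answer strings) of the input consumed by evaluate_loc.py.
import json

preds = [
    # Duplicate ids are allowed; the script renames them to video123_0, video123_1, ...
    {"id": "video123", "question": "When does speaker A wave?", "answer": "12-18", "pred": "12-18"},
    {"id": "video123", "question": "When does speaker B reply?", "answer": "20-26", "pred": "5-9"},
]
with open("preds_example.json", "w") as f:
    json.dump(preds, f)

# Then, for example:
#   python quantitative_evaluation/evaluate_loc.py \
#       --pred_path preds_example.json \
#       --output_dir eval_out \
#       --output_json eval_out/results_example.json \
#       --num_tasks 1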
45 changes: 45 additions & 0 deletions quantitative_evaluation/evalute_loc.sh
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
# evalute_loc.sh

export TASK_NAME="loc"
export DATASET="siq2"
export SPLIT="val" # one of: train, val, test (train is not used for evaluation)
export MODEL_NAME="LLaVA-Lightning-7B-v1-1" # TODO
export PROJECT_ROOT="${HOME}/vtom"
export EXPERIMENT_NAME="${DATASET}_${TASK_NAME}_${SPLIT}_${MODEL_NAME}"
export EXPERIMENT_DIRPATH="${PROJECT_ROOT}/experiments/${EXPERIMENT_NAME}"
export OUTPUT_PER_VIDEO_DIRPATH="${EXPERIMENT_DIRPATH}/output_per_video"
export OUTPUT_DIRPATH="${EXPERIMENT_DIRPATH}/output"
export VIDEO_DIR="${PROJECT_ROOT}/data/${DATASET}/video"
export LOC_DIRPATH="${PROJECT_ROOT}/data/siq2/loc"
export PRED_FPATH="preds_${EXPERIMENT_NAME}"
export RESULTS_FPATH="results_${EXPERIMENT_NAME}.json"
export GT_WITH_TS_FPATH="${LOC_DIRPATH}/loc_${SPLIT}_with_ts.json"
export INSTRUCTION_FINETUNING_FPATH="${LOC_DIRPATH}/loc_${SPLIT}_instruction_with_ts.json"


mkdir -p "${OUTPUT_PER_VIDEO_DIRPATH}"
mkdir -p "${OUTPUT_DIRPATH}"


python ${PROJECT_ROOT}/scripts/convert_instruction_json_to_training_format_siq2_loc.py \
    --input_json_file "${LOC_DIRPATH}/loc_${SPLIT}.json" \
    --output_json_file "${INSTRUCTION_FINETUNING_FPATH}" \
    --gt_ts_file "${GT_WITH_TS_FPATH}"


# Generate video features and predictions
export NPROC_PER_NODE=2
export OMP_NUM_THREADS=$(($(nproc) / ${NPROC_PER_NODE}))
PYTHONPATH="./:$PYTHONPATH" python video_chatgpt/eval/run_inference_loc.py \
    --model-name "${PROJECT_ROOT}/${MODEL_NAME}" \
    --video_dir "${VIDEO_DIR}" \
    --gt_file_qa "${INSTRUCTION_FINETUNING_FPATH}" \
    --output_dir "${OUTPUT_PER_VIDEO_DIRPATH}" \
    --output_name "${PRED_FPATH}"


# PYTHONPATH="./:$PYTHONPATH" python quantitative_evaluation/evaluate_loc.py \
# --pred_path "${OUTPUT_DIRPATH}/${PRED_FPATH}.json" \
# --output_dir "${OUTPUT_PER_VIDEO_DIRPATH}" \
# --output_json "${OUTPUT_DIRPATH}/${RESULTS_FPATH}" \
# --num_tasks 1
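The scoring step above is left commented out; it can be run once the inference step has produced the predictions JSON. Before launching inference, a quick sanity check along the following lines can confirm that the generated instruction file is well formed and that every question id has a matching video under the video directory (run_inference_loc.py below warns and skips ids without a match). This is only a sketch, not part of the commit; the literal paths stand in for $INSTRUCTION_FINETUNING_FPATH and $VIDEO_DIR as defined above.

# Sanity-check sketch (hypothetical example paths, relative to $PROJECT_ROOT).
import json
from glob import glob
from os.path import join

instruction_fpath = "data/siq2/loc/loc_val_instruction_with_ts.json"
video_dir = "data/siq2/video"

with open(instruction_fpath) as f:
    samples = json.load(f)
print(f"{len(samples)} samples loaded")

# run_inference_loc.py matches videos by globbing "<id>*" under the video directory,
# warning and skipping any sample whose id has no match.
missing = [s['id'] for s in samples if not glob(join(video_dir, s['id'] + '*'))]
print(f"{len(missing)} sample ids have no matching video file")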
93 changes: 93 additions & 0 deletions video_chatgpt/eval/run_inference_loc.py
@@ -0,0 +1,93 @@
from os import makedirs as os_makedirs
from os.path import join as os_path_join, exists as os_path_exists
from argparse import ArgumentParser
from json import load as json_load, dump as json_dump
from glob import glob
from warnings import warn
from tqdm import tqdm
from torch import device as torch_device, no_grad as torch_no_grad
from video_chatgpt.eval.model_utils import initialize_model, load_video
from video_chatgpt.inference import video_chatgpt_infer


def parse_args():
    """
    Parse command-line arguments.
    """
    parser = ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--video_dir', help='Directory containing video files.', required=True)
    parser.add_argument('--gt_file_qa', help='Path to the ground truth file containing questions.', required=True)
    parser.add_argument('--output_dir', help='Directory to save the model results JSON.', required=True)
    parser.add_argument('--output_name', help='Name of the file for storing results JSON.', required=True)
    parser.add_argument("--model-name", type=str, required=True)
    parser.add_argument("--conv-mode", type=str, required=False, default='video-chatgpt_v1')
    parser.add_argument("--projection_path", type=str, required=False)

    return parser.parse_args()


@torch_no_grad()
def run_inference(args):
    """
    Run inference on the Social-IQ 2.0 (siq2) localization QA set using the Video-ChatGPT model.
    Args:
        args: Command-line arguments.
    """
    # Initialize the model
    video_dir = args.video_dir
    model, vision_tower, tokenizer, image_processor, video_token_len = initialize_model(args.model_name,
                                                                                        args.projection_path)
    # Load the ground truth file containing questions and answers
    with open(args.gt_file_qa) as file:
        gt_qa = json_load(file)

    # Create the output directory if it doesn't exist
    output_dir = args.output_dir
    if not os_path_exists(output_dir):
        os_makedirs(output_dir)

    output_list = []  # List to store the output results
    conv_mode = args.conv_mode

    device = torch_device('cuda')

    # Iterate over each sample in the ground truth file
    for question_dict in tqdm(gt_qa):
        conversations = question_dict['conversations']
        question = conversations[0]['value']
        question_id = question_dict['id']
        answer = conversations[1]['value']

        sample_set = {'id': question_id, 'question': question, 'answer': answer}

        # Find the video file whose name starts with the question id
        videos_search_path = os_path_join(video_dir, question_id + '*')
        videos_match_list = glob(videos_search_path)
        if not videos_match_list:
            warn(f'No videos found for {videos_search_path}')
            continue
        video_fpath = videos_match_list[0]

        # Load the video frames onto the device
        video_frames = load_video(video_fpath, device)

        try:
            # Run inference on the video and add the output to the list
            output = video_chatgpt_infer(video_frames, question, conv_mode, model, vision_tower,
                                         tokenizer, image_processor, video_token_len)
            sample_set['pred'] = output
            output_list.append(sample_set)
        except Exception as e:
            print(f"Error processing video file '{video_fpath}': {e}")

    # Save the output list to a JSON file
    with open(os_path_join(output_dir, f"{args.output_name}.json"), 'w') as file:
        json_dump(output_list, file)


if __name__ == "__main__":
    args = parse_args()
    run_inference(args)

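To make the data flow concrete: run_inference_loc.py reads gt_file_qa as a list of samples, each with an 'id' and a two-turn 'conversations' list (question first, answer second), and appends one {'id', 'question', 'answer', 'pred'} record per sample to the predictions JSON, which is the shape evaluate_loc.py above scores by exact match. A minimal sketch with hypothetical values follows; only the 'id' and 'value' fields are actually read by the script, the 'from' keys are illustrative.

# Minimal sketch (hypothetical values) of one gt_file_qa entry and the record it produces.
gt_entry = {
    "id": "video123",
    "conversations": [
        {"from": "human", "value": "When does speaker A wave?"},  # question, read from conversations[0]['value']
        {"from": "gpt", "value": "12-18"},                        # answer, read from conversations[1]['value']
    ],
}

# After inference, the corresponding record written to <output_name>.json looks like:
pred_record = {
    "id": "video123",
    "question": "When does speaker A wave?",
    "answer": "12-18",
    "pred": "12-18",  # model output from video_chatgpt_infer
}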