Skip to content

Commit 9bab677

Browse files
authored
Merge branch 'EvolvingLMMs-Lab:main' into add-tinyllava
2 parents dbfb238 + 74facb4 commit 9bab677

11 files changed

+791
-6
lines changed

Diff for: lmms_eval/api/task.py

+43
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,7 @@ def _download_from_youtube(path):
778778
force_unzip = dataset_kwargs.get("force_unzip", False)
779779
cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset", force_download=force_download, etag_timeout=60)
780780
zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
781+
tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True)
781782

782783
def unzip_video_data(zip_file):
783784
import zipfile
@@ -786,10 +787,52 @@ def unzip_video_data(zip_file):
786787
zip_ref.extractall(cache_dir)
787788
eval_logger.info(f"Extracted all files from {zip_file} to {cache_dir}")
788789

790+
def untar_video_data(tar_file):
    """Extract every member of *tar_file* into the shared ``cache_dir``.

    ``cache_dir`` and ``eval_logger`` come from the enclosing download
    routine's scope (same pattern as ``unzip_video_data`` above).
    """
    import tarfile

    with tarfile.open(tar_file, "r") as tar_ref:
        # Archives are downloaded from a remote hub; use the "data"
        # extraction filter (PEP 706) when this Python provides it to
        # reject path-traversal members. Fall back to the legacy
        # unfiltered behavior on older interpreters.
        try:
            tar_ref.extractall(cache_dir, filter="data")
        except TypeError:  # filter= not supported on this Python
            tar_ref.extractall(cache_dir)
        eval_logger.info(f"Extracted all files from {tar_file} to {cache_dir}")
795+
796+
797+
798+
def concat_tar_parts(tar_parts, output_tar):
    """Reassemble a split tar archive by concatenating its parts.

    *tar_parts* are chunk files (e.g. ``foo.tar.part0``, ``foo.tar.part1``)
    produced by splitting one large archive; concatenating them in sorted
    (lexicographic) order reproduces the original *output_tar* byte stream.
    ``eval_logger`` comes from the enclosing scope.
    """
    # Hoisted out of the with-block: imports don't belong inside I/O scopes.
    from tqdm import tqdm
    import shutil

    with open(output_tar, "wb") as out_tar:
        for part in tqdm(sorted(tar_parts)):
            # Stream each part instead of part_file.read(): video archive
            # chunks can be multiple GB, and a full read() would hold an
            # entire chunk in memory at once.
            with open(part, "rb") as part_file:
                shutil.copyfileobj(part_file, out_tar)
    eval_logger.info(f"Concatenated parts {tar_parts} into {output_tar}")
805+
806+
# Unzip zip files if needed
789807
if force_unzip or (not os.path.exists(cache_dir) and len(zip_files) > 0):
790808
for zip_file in zip_files:
791809
unzip_video_data(zip_file)
792810

811+
# Concatenate and extract tar files if needed
812+
if force_unzip or (not os.path.exists(cache_dir) and len(tar_files) > 0):
813+
tar_parts_dict = {}
814+
815+
# Group tar parts together
816+
for tar_file in tar_files:
817+
base_name = tar_file.split('.tar')[0]
818+
if base_name not in tar_parts_dict:
819+
tar_parts_dict[base_name] = []
820+
tar_parts_dict[base_name].append(tar_file)
821+
822+
823+
# Concatenate and untar split parts
824+
for base_name, parts in tar_parts_dict.items():
825+
eval_logger.info(f"Extracting following tar files: {parts}")
826+
output_tar = base_name + ".tar"
827+
if not os.path.exists(output_tar):
828+
eval_logger.info(f"Start concatenating tar files")
829+
830+
concat_tar_parts(parts, output_tar)
831+
eval_logger.info(f"Finish concatenating tar files")
832+
833+
if not os.path.exists(os.path.join(cache_dir, os.path.basename(base_name))):
834+
untar_video_data(output_tar)
835+
793836
accelerator.wait_for_everyone()
794837
dataset_kwargs.pop("cache_dir")
795838
dataset_kwargs.pop("video")

Diff for: lmms_eval/models/llava_vid.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def __init__(
9696
self.mm_spatial_pool_out_channels = int(mm_spatial_pool_out_channels)
9797
self.mm_spatial_pool_mode = mm_spatial_pool_mode
9898
self.max_frames_num = int(max_frames_num)
99+
print(self.max_frames_num)
99100
if self.overwrite == True:
100101
overwrite_config = {}
101102
overwrite_config["mm_resampler_type"] = self.mm_resampler_type
@@ -416,4 +417,4 @@ def generate_until(self, requests) -> List[str]:
416417
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
417418
res.append(outputs)
418419
pbar.update(1)
419-
return res
420+
return res

Diff for: lmms_eval/models/phi3v.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,16 @@ def _collate(x):
185185
contexts = list(contexts)
186186
for i in range(len(contexts)):
187187
if "<image>" in contexts[i]:
188-
query = contexts[i].replace("<image>", "<|image_1|>")
188+
query = "" + contexts[i]
189+
img_placeholder_count = 1
190+
while "<image>" in query:
191+
query = query.replace("<image>", f"<|image_{img_placeholder_count}|>", 1)
192+
img_placeholder_count += 1
189193
else:
190-
query = f"<|image_1|>\n{contexts[i]}"
194+
query = ""
195+
for placeholder_id in range(len(visuals)):
196+
query += f"<|image_{placeholder_id+1}|>\n"
197+
query += contexts[i]
191198
messages = [
192199
{"role": "user", "content": query}
193200
]
@@ -196,12 +203,11 @@ def _collate(x):
196203
tokenize=False,
197204
add_generation_prompt=True)
198205
assert len(contexts) == 1
199-
# We always pass a single image given that the model only accepts one image (as of 5/21/24).
206+
#
200207
context = contexts[0]
201-
pil_image = visuals[0]
202208
input_ids = self._processor(
203209
text=context,
204-
images=[pil_image],
210+
images=visuals,
205211
return_tensors="pt").to(self._device, self.model.dtype)
206212
# Setting default parameters.
207213
if "max_new_tokens" not in gen_kwargs:
+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
dataset_path: longvideobench/LongVideoBench
2+
dataset_kwargs:
3+
token: True
4+
cache_dir: longvideobench
5+
video: True
6+
force_download: False
7+
local_files_only: False
8+
# From_YouTube: True
9+
task: longvideobench_val_i
10+
test_split: validation
11+
doc_to_visual: !function utils.longvideobench_doc_to_visual_i
12+
doc_to_text: !function utils.longvideobench_doc_to_text
13+
doc_to_target: "correct_choice"
14+
generation_kwargs:
15+
max_new_tokens: 32
16+
temperature: 0
17+
do_sample: False
18+
process_results: !function utils.longvideobench_process_results
19+
metric_list:
20+
- metric: lvb_acc
21+
aggregation: !function utils.longvideobench_aggregate_results
22+
higher_is_better: true
23+
24+
model_specific_prompt_kwargs:
25+
default:
26+
pre_prompt: ""
27+
post_prompt: "Answer with the option's letter from the given choices directly.\n"
28+
insert_interleave_subtitles: True
29+
+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
dataset_path: longvideobench/LongVideoBench
2+
dataset_kwargs:
3+
token: True
4+
cache_dir: longvideobench
5+
video: True
6+
force_download: False
7+
local_files_only: False
8+
# From_YouTube: True
9+
task: longvideobench_val_v
10+
test_split: validation
11+
doc_to_visual: !function utils.longvideobench_doc_to_visual_v
12+
doc_to_text: !function utils.longvideobench_doc_to_text
13+
doc_to_target: "correct_choice"
14+
generation_kwargs:
15+
max_new_tokens: 32
16+
temperature: 0
17+
do_sample: False
18+
process_results: !function utils.longvideobench_process_results
19+
metric_list:
20+
- metric: lvb_acc
21+
aggregation: !function utils.longvideobench_aggregate_results
22+
higher_is_better: true
23+
24+
model_specific_prompt_kwargs:
25+
default:
26+
pre_prompt: ""
27+
post_prompt: "Answer with the option's letter from the given choices directly.\n"
28+

0 commit comments

Comments
 (0)