Skip to content

Commit 9bab677

Browse files
authored
Merge branch 'EvolvingLMMs-Lab:main' into add-tinyllava
2 parents dbfb238 + 74facb4 commit 9bab677

11 files changed

+791
-6
lines changed

Diff for: lmms_eval/api/task.py

+43
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,7 @@ def _download_from_youtube(path):
778778
force_unzip = dataset_kwargs.get("force_unzip", False)
779779
cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset", force_download=force_download, etag_timeout=60)
780780
zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
781+
tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True)
781782

782783
def unzip_video_data(zip_file):
783784
import zipfile
@@ -786,10 +787,52 @@ def unzip_video_data(zip_file):
786787
zip_ref.extractall(cache_dir)
787788
eval_logger.info(f"Extracted all files from {zip_file} to {cache_dir}")
788789

790+
def untar_video_data(tar_file):
    """Extract every member of *tar_file* into the shared ``cache_dir``.

    ``cache_dir`` and ``eval_logger`` come from the enclosing download
    routine's scope (same pattern as ``unzip_video_data`` above).
    """
    import tarfile

    with tarfile.open(tar_file, "r") as tar_ref:
        # Archives are downloaded from a remote hub; use the "data"
        # extraction filter (PEP 706) when this Python provides it to
        # reject path-traversal members. Fall back to the legacy
        # unfiltered behavior on older interpreters.
        try:
            tar_ref.extractall(cache_dir, filter="data")
        except TypeError:  # filter= not supported on this Python
            tar_ref.extractall(cache_dir)
        eval_logger.info(f"Extracted all files from {tar_file} to {cache_dir}")
795+
796+
797+
798+
def concat_tar_parts(tar_parts, output_tar):
    """Reassemble a split tar archive by concatenating its parts.

    *tar_parts* are chunk files (e.g. ``foo.tar.part0``, ``foo.tar.part1``)
    produced by splitting one large archive; concatenating them in sorted
    (lexicographic) order reproduces the original *output_tar* byte stream.
    ``eval_logger`` comes from the enclosing scope.
    """
    # Hoisted out of the with-block: imports don't belong inside I/O scopes.
    from tqdm import tqdm
    import shutil

    with open(output_tar, "wb") as out_tar:
        for part in tqdm(sorted(tar_parts)):
            # Stream each part instead of part_file.read(): video archive
            # chunks can be multiple GB, and a full read() would hold an
            # entire chunk in memory at once.
            with open(part, "rb") as part_file:
                shutil.copyfileobj(part_file, out_tar)
    eval_logger.info(f"Concatenated parts {tar_parts} into {output_tar}")
805+
806+
# Unzip zip files if needed
789807
if force_unzip or (not os.path.exists(cache_dir) and len(zip_files) > 0):
790808
for zip_file in zip_files:
791809
unzip_video_data(zip_file)
792810

811+
# Concatenate and extract tar files if needed
812+
if force_unzip or (not os.path.exists(cache_dir) and len(tar_files) > 0):
813+
tar_parts_dict = {}
814+
815+
# Group tar parts together
816+
for tar_file in tar_files:
817+
base_name = tar_file.split('.tar')[0]
818+
if base_name not in tar_parts_dict:
819+
tar_parts_dict[base_name] = []
820+
tar_parts_dict[base_name].append(tar_file)
821+
822+
823+
# Concatenate and untar split parts
824+
for base_name, parts in tar_parts_dict.items():
825+
eval_logger.info(f"Extracting following tar files: {parts}")
826+
output_tar = base_name + ".tar"
827+
if not os.path.exists(output_tar):
828+
eval_logger.info(f"Start concatenating tar files")
829+
830+
concat_tar_parts(parts, output_tar)
831+
eval_logger.info(f"Finish concatenating tar files")
832+
833+
if not os.path.exists(os.path.join(cache_dir, os.path.basename(base_name))):
834+
untar_video_data(output_tar)
835+
793836
accelerator.wait_for_everyone()
794837
dataset_kwargs.pop("cache_dir")
795838
dataset_kwargs.pop("video")

Diff for: lmms_eval/models/llava_vid.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def __init__(
9696
self.mm_spatial_pool_out_channels = int(mm_spatial_pool_out_channels)
9797
self.mm_spatial_pool_mode = mm_spatial_pool_mode
9898
self.max_frames_num = int(max_frames_num)
99+
print(self.max_frames_num)
99100
if self.overwrite == True:
100101
overwrite_config = {}
101102
overwrite_config["mm_resampler_type"] = self.mm_resampler_type
@@ -416,4 +417,4 @@ def generate_until(self, requests) -> List[str]:
416417
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
417418
res.append(outputs)
418419
pbar.update(1)
419-
return res
420+
return res

Diff for: lmms_eval/models/phi3v.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,16 @@ def _collate(x):
185185
contexts = list(contexts)
186186
for i in range(len(contexts)):
187187
if "<image>" in contexts[i]:
188-
query = contexts[i].replace("<image>", "<|image_1|>")
188+
query = "" + contexts[i]
189+
img_placeholder_count = 1
190+
while "<image>" in query:
191+
query = query.replace("<image>", f"<|image_{img_placeholder_count}|>", 1)
192+
img_placeholder_count += 1
189193
else:
190-
query = f"<|image_1|>\n{contexts[i]}"
194+
query = ""
195+
for placeholder_id in range(len(visuals)):
196+
query += f"<|image_{placeholder_id+1}|>\n"
197+
query += contexts[i]
191198
messages = [
192199
{"role": "user", "content": query}
193200
]
@@ -196,12 +203,11 @@ def _collate(x):
196203
tokenize=False,
197204
add_generation_prompt=True)
198205
assert len(contexts) == 1
199-
# We always pass a single image given that the model only accepts one image (as of 5/21/24).
206+
#
200207
context = contexts[0]
201-
pil_image = visuals[0]
202208
input_ids = self._processor(
203209
text=context,
204-
images=[pil_image],
210+
images=visuals,
205211
return_tensors="pt").to(self._device, self.model.dtype)
206212
# Setting default parameters.
207213
if "max_new_tokens" not in gen_kwargs:
+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
dataset_path: longvideobench/LongVideoBench
2+
dataset_kwargs:
3+
token: True
4+
cache_dir: longvideobench
5+
video: True
6+
force_download: False
7+
local_files_only: False
8+
# From_YouTube: True
9+
task: longvideobench_val_i
10+
test_split: validation
11+
doc_to_visual: !function utils.longvideobench_doc_to_visual_i
12+
doc_to_text: !function utils.longvideobench_doc_to_text
13+
doc_to_target: "correct_choice"
14+
generation_kwargs:
15+
max_new_tokens: 32
16+
temperature: 0
17+
do_sample: False
18+
process_results: !function utils.longvideobench_process_results
19+
metric_list:
20+
- metric: lvb_acc
21+
aggregation: !function utils.longvideobench_aggregate_results
22+
higher_is_better: true
23+
24+
model_specific_prompt_kwargs:
25+
default:
26+
pre_prompt: ""
27+
post_prompt: "Answer with the option's letter from the given choices directly.\n"
28+
insert_interleave_subtitles: True
29+
+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
dataset_path: longvideobench/LongVideoBench
2+
dataset_kwargs:
3+
token: True
4+
cache_dir: longvideobench
5+
video: True
6+
force_download: False
7+
local_files_only: False
8+
# From_YouTube: True
9+
task: longvideobench_val_v
10+
test_split: validation
11+
doc_to_visual: !function utils.longvideobench_doc_to_visual_v
12+
doc_to_text: !function utils.longvideobench_doc_to_text
13+
doc_to_target: "correct_choice"
14+
generation_kwargs:
15+
max_new_tokens: 32
16+
temperature: 0
17+
do_sample: False
18+
process_results: !function utils.longvideobench_process_results
19+
metric_list:
20+
- metric: lvb_acc
21+
aggregation: !function utils.longvideobench_aggregate_results
22+
higher_is_better: true
23+
24+
model_specific_prompt_kwargs:
25+
default:
26+
pre_prompt: ""
27+
post_prompt: "Answer with the option's letter from the given choices directly.\n"
28+

0 commit comments

Comments
 (0)