Commit 511b625

Merge commit: 2 parents 22a4958 + 050b2c3

449 files changed: +22285 -582 lines


Diff for: .github/issue_template.md

File mode changed: 100644 → 100755.

Diff for: .github/pull_request_template.md

File mode changed: 100644 → 100755.

Diff for: .github/workflows/black.yml

File mode changed: 100644 → 100755.

Diff for: .gitignore

File mode changed: 100644 → 100755; +13 lines

@@ -24,3 +24,16 @@ submissions/
 lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json
 lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json
 zk.log
+cache_dir
+ckpt
+pretrained/
+LLaVA/
+*logs
+temp/
+InternVL/
+logs/
+data/
+llava-video/
+Video-MME/
+VATEX/
+lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc

Diff for: .pre-commit-config.yaml

File mode changed: 100644 → 100755.

Diff for: LICENSE

New file: +56 lines

# For the main pipeline structure-related code, we maintain the original license provided with lm-evaluation-harness, which is the MIT License.

MIT License

Copyright (c) 2024 LMMs-Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

# For the multimodal models and datasets that we have added (defined as code in the lmms_eval/tasks and lmms_eval/models folders), we apply the Apache License.

Apache 2.0 License

Copyright (c) 2024 LMMs-Lab

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

When modifying the code, please include the following information about the original lmms-eval source:
# Adopted from lmms-eval from https://github.com/EvolvingLMMs-Lab/lmms-eval. Below is the original copyright:
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Diff for: README.md

File mode changed: 100644 → 100755; +172 -196 lines (large diff not rendered by default).

Diff for: docs/README.md

File mode changed: 100644 → 100755.

Diff for: docs/commands.md

File mode changed: 100644 → 100755.

Diff for: docs/current_tasks.md

New file: +122 lines

# Current Tasks

> () indicates the task name in the lmms_eval. The task name is also used to specify the dataset in the configuration file.
> The following is manually updated documentation. You could use `lmms_eval task --list` to list all supported tasks and their task names.

- AI2D (ai2d)
- ChartQA (chartqa)
- CMMMU (cmmmu)
- CMMMU Validation (cmmmu_val)
- CMMMU Test (cmmmu_test)
- COCO Caption (coco_cap)
- COCO 2014 Caption (coco2014_cap)
- COCO 2014 Caption Validation (coco2014_cap_val)
- COCO 2014 Caption Test (coco2014_cap_test)
- COCO 2017 Caption (coco2017_cap)
- COCO 2017 Caption MiniVal (coco2017_cap_val)
- COCO 2017 Caption MiniTest (coco2017_cap_test)
- [ConBench](https://github.com/foundation-multimodal-models/ConBench) (conbench)
- DOCVQA (docvqa)
- DOCVQA Validation (docvqa_val)
- DOCVQA Test (docvqa_test)
- Ferret (ferret)
- Flickr30K (flickr30k)
- Ferret Test (ferret_test)
- GQA (gqa)
- HallusionBenchmark (hallusion_bench_image)
- Infographic VQA (info_vqa)
- Infographic VQA Validation (info_vqa_val)
- Infographic VQA Test (info_vqa_test)
- LLaVA-Bench (llava_in_the_wild)
- LLaVA-Bench-COCO (llava_bench_coco)
- MathVerse (mathverse)
- MathVerse Text Dominant (mathverse_testmini_text_dominant)
- MathVerse Text Only (mathverse_testmini_text_only)
- MathVerse Text Lite (mathverse_testmini_text_lite)
- MathVerse Vision Dominant (mathverse_testmini_vision_dominant)
- MathVerse Vision Intensive (mathverse_testmini_vision_intensive)
- MathVerse Vision Only (mathverse_testmini_vision_only)
- MathVista (mathvista)
- MathVista Validation (mathvista_testmini)
- MathVista Test (mathvista_test)
- MMBench (mmbench)
- MMBench English (mmbench_en)
- MMBench English Dev (mmbench_en_dev)
- MMBench English Test (mmbench_en_test)
- MMBench Chinese (mmbench_cn)
- MMBench Chinese Dev (mmbench_cn_dev)
- MMBench Chinese Test (mmbench_cn_test)
- MME (mme)
- MMMU (mmmu)
- MMMU Validation (mmmu_val)
- MMMU Test (mmmu_test)
- MMUPD (mmupd)
- MMUPD Base (mmupd_base)
- MMAAD Base (mmaad_base)
- MMIASD Base (mmiasd_base)
- MMIVQD Base (mmivqd_base)
- MMUPD Option (mmupd_option)
- MMAAD Option (mmaad_option)
- MMIASD Option (mmiasd_option)
- MMIVQD Option (mmivqd_option)
- MMUPD Instruction (mmupd_instruction)
- MMAAD Instruction (mmaad_instruction)
- MMIASD Instruction (mmiasd_instruction)
- MMIVQD Instruction (mmivqd_instruction)
- MMVet (mmvet)
- Multi-DocVQA (multidocvqa)
- Multi-DocVQA Validation (multidocvqa_val)
- Multi-DocVQA Test (multidocvqa_test)
- NoCaps (nocaps)
- NoCaps Validation (nocaps_val)
- NoCaps Test (nocaps_test)
- OKVQA (ok_vqa)
- OKVQA Validation 2014 (ok_vqa_val2014)
- POPE (pope)
- RefCOCO (refcoco)
- refcoco_seg_test
- refcoco_seg_val
- refcoco_seg_testA
- refcoco_seg_testB
- refcoco_bbox_test
- refcoco_bbox_val
- refcoco_bbox_testA
- refcoco_bbox_testB
- RefCOCO+ (refcoco+)
- refcoco+_seg
- refcoco+_seg_val
- refcoco+_seg_testA
- refcoco+_seg_testB
- refcoco+_bbox
- refcoco+_bbox_val
- refcoco+_bbox_testA
- refcoco+_bbox_testB
- RefCOCOg (refcocog)
- refcocog_seg_test
- refcocog_seg_val
- refcocog_bbox_test
- refcocog_bbox_val
- ScienceQA (scienceqa_full)
- ScienceQA Full (scienceqa)
- ScienceQA IMG (scienceqa_img)
- ScreenSpot (screenspot)
- ScreenSpot REC / Grounding (screenspot_rec)
- ScreenSpot REG / Instruction Generation (screenspot_reg)
- SeedBench (seedbench)
- SeedBench 2 (seedbench_2)
- ST-VQA (stvqa)
- TextCaps (textcaps)
- TextCaps Validation (textcaps_val)
- TextCaps Test (textcaps_test)
- TextVQA (textvqa)
- TextVQA Validation (textvqa_val)
- TextVQA Test (textvqa_test)
- VizWizVQA (vizwiz_vqa)
- VizWizVQA Validation (vizwiz_vqa_val)
- VizWizVQA Test (vizwiz_vqa_test)
- VQAv2 (vqav2)
- VQAv2 Validation (vqav2_val)
- VQAv2 Test (vqav2_test)
- WebSRC (websrc)
- WebSRC Validation (websrc_val)
- WebSRC Test (websrc_test)
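
The names in parentheses above are the identifiers the harness resolves into task objects. As a rough sketch of that resolution (not part of this commit; the import path and the "mme" example are assumptions, though the two calls mirror the ones used in the lmms_eval/__main__.py diff further down):

```python
# Hypothetical usage sketch -- not part of this commit.
# Assumes the task helpers are importable from lmms_eval.tasks, as in lm-evaluation-harness.
from lmms_eval.tasks import initialize_tasks, get_task_dict

initialize_tasks("INFO")                                # register all task configs
task_dict = get_task_dict(["mme"], model_name="llava")  # "mme" is the task name in parentheses
task_obj = task_dict["mme"]
print(task_obj.config.task)                             # -> "mme"
```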

Diff for: docs/model_guide.md

File mode changed: 100644 → 100755.

Diff for: docs/task_guide.md

File mode changed: 100644 → 100755; +1 -1 lines

@@ -27,7 +27,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 16
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 # The return value of process_results will be used by metrics
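
For reference, the generation_kwargs block parses into an ordinary dict that is forwarded to the model's generation call. With `do_sample: false` decoding is greedy and `top_p` is effectively inert, but `1.0` is the valid "no nucleus truncation" value, whereas `0` is not a meaningful nucleus mass. A minimal parsing sketch (assumes PyYAML; this is not the harness's own config loader):

```python
# Illustration only: parse the corrected generation_kwargs block (assumes PyYAML).
import yaml

snippet = """
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
"""
print(yaml.safe_load(snippet)["generation_kwargs"])
# {'max_new_tokens': 16, 'temperature': 0, 'top_p': 1.0, 'num_beams': 1, 'do_sample': False}
```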

Diff for: example_eval.yaml

File deleted (-15 lines).

Diff for: lmms_eval/__init__.py

File mode changed: 100644 → 100755.

Diff for: lmms_eval/__main__.py

File mode changed: 100644 → 100755; +31 -12 lines
@@ -106,9 +106,16 @@ def parse_eval_args() -> argparse.Namespace:
     parser.add_argument(
         "--log_samples_suffix",
         type=str,
-        default="",
+        default="model_outputs",
         help="Specify a suffix for the log_samples file name.",
     )
+    parser.add_argument(
+        "--predict_only",
+        "-x",
+        action="store_true",
+        default=False,
+        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
+    )
     parser.add_argument(
         "--show_config",
         action="store_true",
@@ -228,6 +235,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:

     initialize_tasks(args.verbosity)

+    if args.predict_only:
+        args.log_samples = True
+    if (args.log_samples or args.predict_only) and not args.output_path:
+        raise ValueError("Specify --output_path if providing --log_samples or --predict_only")
     if args.limit:
         eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
     if args.include_path is not None:
@@ -244,14 +255,17 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
         )
         eval_logger.info(log_message)
-        task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name="llava")
-        for task_name in task_dict.keys():
-            task_obj = task_dict[task_name]
-            if type(task_obj) == tuple:
-                group, task_obj = task_obj
-                if task_obj is None:
-                    continue
-            eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+        for task_name in sorted(ALL_TASKS):
+            try:
+                task_dict = get_task_dict([task_name], model_name="llava")
+                task_obj = task_dict[task_name]
+                if type(task_obj) == tuple:
+                    group, task_obj = task_obj
+                    if task_obj is None:
+                        continue
+                eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+            except Exception as e:
+                eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}")
         sys.exit()
     else:
         tasks_list = args.tasks.split(",")
@@ -271,6 +285,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     # set datetime before evaluation
     datetime_str = utils.get_datetime_str(timezone=args.timezone)
     if args.output_path:
+        if args.log_samples_suffix and len(args.log_samples_suffix) > 15:
+            eval_logger.warning("The suffix for log_samples is too long. It is recommended to keep it under 15 characters.")
+            args.log_samples_suffix = args.log_samples_suffix[:5] + "..." + args.log_samples_suffix[-5:]
+
         hash_input = f"{args.model_args}".encode("utf-8")
         hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
         path = Path(args.output_path)
293311
log_samples=args.log_samples,
294312
gen_kwargs=args.gen_kwargs,
295313
cli_args=args,
314+
predict_only=args.predict_only,
296315
)
297316

298317
if results is not None:
@@ -315,9 +334,9 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
315334
for task_name, config in results["configs"].items():
316335
filename = args.output_path.joinpath(f"{task_name}.json")
317336
# Structure the data with 'args' and 'logs' keys
318-
data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])} # Convert Namespace to dict
319-
samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable)
320-
filename.open("w").write(samples_dumped)
337+
data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
338+
samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
339+
filename.open("w", encoding="utf-8").write(samples_dumped)
321340
eval_logger.info(f"Saved samples to {filename}")
322341

323342
return results, samples
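
The switch to `ensure_ascii=False` plus an explicit UTF-8 file handle keeps non-ASCII model outputs readable in the per-task sample logs instead of being written as escape sequences. A small sketch of the difference (the record is invented):

```python
# Invented record; shows what ensure_ascii=False changes in the dumped JSON.
import json

record = {"doc_id": 0, "response": "北京"}
print(json.dumps(record))                      # {"doc_id": 0, "response": "\u5317\u4eac"}
print(json.dumps(record, ensure_ascii=False))  # {"doc_id": 0, "response": "北京"}
```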

Diff for: lmms_eval/api/__init__.py

File mode changed: 100644 → 100755.

Diff for: lmms_eval/api/filter.py

File mode changed: 100644 → 100755.

Diff for: lmms_eval/api/instance.py

File mode changed: 100644 → 100755.

Diff for: lmms_eval/api/metrics.py

File mode changed: 100644 → 100755; +15 lines

@@ -16,6 +16,11 @@


 # Register Aggregations First
+@register_aggregation("bypass")
+def bypass_agg(arr):
+    return 999
+
+
 @register_aggregation("mean")
 def mean(arr):
     return sum(arr) / len(arr)

@@ -226,6 +231,16 @@ def mean_stderr(arr):
     return sample_stddev(arr) / math.sqrt(len(arr))


+@register_metric(
+    metric="bypass",
+    higher_is_better=True,
+    output_type=["loglikelihood", "multiple_choice", "generate_until"],
+    aggregation="bypass",
+)
+def bypass(items):
+    return items
+
+
 @register_metric(
     metric="mcc",
     higher_is_better=True,
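
The new "bypass" metric passes per-sample items through unchanged, and its aggregation returns the 999 placeholder instead of a real score; presumably this pairs with the new `--predict_only` flag so sample outputs can be logged without computing metrics. Stripped of the registry decorators, the behaviour is simply:

```python
# Self-contained restatement of the two registered functions above,
# outside the registry machinery.
def bypass(items):
    return items

def bypass_agg(arr):
    return 999

per_sample = bypass(["a cat", "a dog"])  # items come back untouched, nothing is scored
print(per_sample)                        # ['a cat', 'a dog']
print(bypass_agg(per_sample))            # 999 -- placeholder aggregate, not a real metric
```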

Diff for: lmms_eval/api/model.py

File mode changed: 100644 → 100755.

Diff for: lmms_eval/api/registry.py

File mode changed: 100644 → 100755; +18 lines

@@ -1,6 +1,8 @@
 from lmms_eval.api.model import lmms

+from typing import Callable, Dict
 import logging
+import evaluate as hf_evaluate

 eval_logger = logging.getLogger("lmms-eval")


@@ -104,6 +106,22 @@ def decorate(fn):
     return decorate


+def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
+    if not hf_evaluate_metric:
+        if name in METRIC_REGISTRY:
+            return METRIC_REGISTRY[name]
+        else:
+            eval_logger.warning(f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library...")
+
+    try:
+        metric_object = hf_evaluate.load(name)
+        return metric_object.compute
+    except Exception:
+        eval_logger.error(
+            f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
+        )
+
+
 def register_aggregation(name):
     def decorate(fn):
         assert name not in AGGREGATION_REGISTRY, f"aggregation named '{name}' conflicts with existing registered aggregation!"
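
The new `get_metric` checks the local `METRIC_REGISTRY` first and only then falls back to the Hugging Face Evaluate hub. A usage sketch of the fallback path (requires the `evaluate` package; the metric name is illustrative):

```python
# Fallback-path sketch: a name missing from METRIC_REGISTRY is loaded from the
# HF Evaluate hub, and its .compute method is what get_metric hands back.
import evaluate as hf_evaluate

compute = hf_evaluate.load("exact_match").compute
print(compute(predictions=["yes"], references=["yes"]))  # {'exact_match': 1.0}
```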

Diff for: lmms_eval/api/samplers.py

File mode changed: 100644 → 100755.
