EvolvingLMMs-Lab
diff --git a/Diff for: ‎README.md
+28-12 b/Diff for: ‎README.md
+28-12
diff --git a/Diff for: ‎lmms_eval/api/task.py
+27-12 b/Diff for: ‎lmms_eval/api/task.py
+27-12
diff --git a/Diff for: ‎lmms_eval/filters/__init__.py
+2-1 b/Diff for: ‎lmms_eval/filters/__init__.py
+2-1
diff --git a/Diff for: ‎lmms_eval/filters/extraction.py
+170-16 b/Diff for: ‎lmms_eval/filters/extraction.py
+170-16
diff --git a/Diff for: ‎lmms_eval/models/__init__.py
+2 b/Diff for: ‎lmms_eval/models/__init__.py
+2
@@ -201,14 +201,21 @@ We also provide the raw data exported from Weights & Biases for the detailed res
   - OKVQA Validation 2014 (ok_vqa_val2014)
 - POPE (pope)
 - RefCOCO (refcoco)
-    - refcoco_seg_test
-    - refcoco_seg_val
-    - refcoco_seg_testA
-    - refcoco_seg_testB
-    - refcoco_bbox_test
-    - refcoco_bbox_val
-    - refcoco_bbox_testA
-    - refcoco_bbox_testB
+    - refcoco_seg
+      - refcoco_seg_test
+      - refcoco_seg_val
+      - refcoco_seg_testA
+      - refcoco_seg_testB
+    - refcoco_bbox
+      - refcoco_bbox_test
+      - refcoco_bbox_val
+      - refcoco_bbox_testA
+      - refcoco_bbox_testB
+    - refcoco_bbox_rec
+      - refcoco_bbox_rec_test
+      - refcoco_bbox_rec_val
+      - refcoco_bbox_rec_testA
+      - refcoco_bbox_rec_testB
 - RefCOCO+ (refcoco+)
     - refcoco+_seg
         - refcoco+_seg_val
@@ -218,11 +225,20 @@ We also provide the raw data exported from Weights & Biases for the detailed res
         - refcoco+_bbox_val
         - refcoco+_bbox_testA
         - refcoco+_bbox_testB
+    - refcoco+_bbox_rec
+        - refcoco+_bbox_rec_val
+        - refcoco+_bbox_rec_testA
+        - refcoco+_bbox_rec_testB
 - RefCOCOg (refcocog)
-    - refcocog_seg_test
-    - refcocog_seg_val
-    - refcocog_bbox_test
-    - refcocog_bbox_val
+    - refcocog_seg
+      - refcocog_seg_test
+      - refcocog_seg_val
+    - refcocog_bbox
+      - refcocog_bbox_test
+      - refcocog_bbox_val
+    - refcocog_bbox_rec
+      - refcocog_bbox_rec_test
+      - refcocog_bbox_rec_val
 - ScienceQA (scienceqa_full)
   - ScienceQA Full (scienceqa)
   - ScienceQA IMG (scienceqa_img)
 
@@ -678,6 +678,22 @@ def _prepare_metric_and_aggregation(self):
 
     @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
     def download(self, dataset_kwargs=None) -> None:
+        # If the dataset is a video dataset, recursively search the downloaded
+        # snapshot for zip archives and unpack them under the Hugging Face home.
+        if dataset_kwargs is not None and "video" in dataset_kwargs and dataset_kwargs["video"]:
+            hf_home = os.environ["HF_HOME"]
+            cache_dir = dataset_kwargs["cache_dir"]
+            dataset_kwargs.pop("cache_dir")
+            cache_dir = os.path.join(hf_home, cache_dir)
+            cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset")
+            zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
+            if not os.path.exists(cache_dir):
+                for zip_file in zip_files:
+                    shutil.unpack_archive(zip_file, cache_dir)
+            builder_script = dataset_kwargs["builder_script"]
+            self.DATASET_PATH = os.path.join(cache_path, builder_script)
+            dataset_kwargs.pop("video")
+            dataset_kwargs.pop("builder_script")
         download_config = DownloadConfig()
         download_config.max_retries = dataset_kwargs.get("max_retries", 3) if dataset_kwargs is not None else 3
         download_config.num_proc = dataset_kwargs.get("num_proc", 8) if dataset_kwargs is not None else 8
@@ -687,12 +703,15 @@ def download(self, dataset_kwargs=None) -> None:
             download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             **dataset_kwargs if dataset_kwargs is not None else {},
         )
-        self.dataset_no_image = datasets.load_dataset(
-            path=self.DATASET_PATH,
-            name=self.DATASET_NAME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
-            **dataset_kwargs if dataset_kwargs is not None else {},
-        )
+        if self.config.process_docs is not None:
+            for split in self.dataset:
+                if split in [
+                    self.config.training_split, self.config.validation_split, self.config.test_split, self.config.fewshot_split
+                ]:
+                    self.dataset[split] = self.config.process_docs(self.dataset[split])
+
+        # copy dataset, remove image features
+        self.dataset_no_image = self.dataset.copy()
         for doc_name in self.dataset_no_image:
             remove_cols = []
             features = self.dataset_no_image[doc_name].features
@@ -725,20 +744,14 @@ def has_test_docs(self) -> bool:
 
     def training_docs(self) -> datasets.Dataset:
+        """Return `self.dataset[self.config.training_split]` when `has_training_docs()` is true.
+
+        Falls through (implicitly returning None) otherwise. `config.process_docs`
+        is not applied in this method.
+        """
         if self.has_training_docs():
-            if self.config.process_docs is not None:
-                return self.config.process_docs(self.dataset[self.config.training_split])
             return self.dataset[self.config.training_split]
 
     def validation_docs(self) -> datasets.Dataset:
+        """Return `self.dataset[self.config.validation_split]` when `has_validation_docs()` is true.
+
+        Falls through (implicitly returning None) otherwise. `config.process_docs`
+        is not applied in this method.
+        """
         if self.has_validation_docs():
-            if self.config.process_docs is not None:
-                return self.config.process_docs(self.dataset[self.config.validation_split])
             return self.dataset[self.config.validation_split]
 
     def test_docs(self) -> datasets.Dataset:
+        """Return `self.dataset[self.config.test_split]` when `has_test_docs()` is true.
+
+        Falls through (implicitly returning None) otherwise. `config.process_docs`
+        is not applied in this method.
+        """
         if self.has_test_docs():
-            if self.config.process_docs is not None:
-                return self.config.process_docs(self.dataset[self.config.test_split])
             return self.dataset[self.config.test_split]
 
     def fewshot_docs(self):
@@ -973,6 +986,8 @@ def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Inst
         return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, **kwargs)
 
     def process_results(self, doc, results):
+        if self.OUTPUT_TYPE == "generate_until":
+            results[0] = results[0].strip()
         if callable(self.config.process_results):
             return self.config.process_results(doc, results)
 
 
@@ -1,4 +1,4 @@
-from lmms_eval.api.filter import FilterEnsemble
+from lmms_eval.api.filter import FilterEnsemble, Filter
 from . import selection
 from . import extraction
 from . import transformation
@@ -13,6 +13,7 @@
     "lowercase": transformation.LowercaseFilter,
     "uppercase": transformation.UppercaseFilter,
     "map": transformation.MapFilter,
+    "multi_choice_regex": extraction.MultiChoiceRegexFilter,
     # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
     # that takes an input and returns a scalar and then should select the max reward,
     # or should implement different filters for different ways of handling a reward model's inference.
 
@@ -1,18 +1,47 @@
 import re
-
+import sys
+import unicodedata
 from lmms_eval.api.filter import Filter
 
 
+class WhitespaceFilter(Filter):
+    """Filter that drops a single leading space from each response string."""
+
+    def __init__(self) -> None:
+        """This filter takes no configuration."""
+
+    def apply(self, resps, docs):
+        """Return `resps` with one leading " " removed from every response.
+
+        `resps` is a list of response sets (each an iterable of strings); the
+        result is a list of lists of the same shape. `docs` is not used.
+        """
+
+        def drop_leading_space(text):
+            # Only a single leading space is removed; any further whitespace stays.
+            return text[1:] if text.startswith(" ") else text
+
+        return [[drop_leading_space(text) for text in response_set] for response_set in resps]
+
+
 class RegexFilter(Filter):
     """ """
 
-    def __init__(self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", fallback: str = "[invalid]") -> None:
+    def __init__(
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+    ) -> None:
         """
         pass a string `regex` to run `re.compile(r"regex")` on.
         `fallback` defines the output returned if no matches for the regex are located.
         """
         self.regex_pattern = regex_pattern
         self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
         self.fallback = fallback
 
     def apply(self, resps, docs):
@@ -23,9 +52,12 @@ def apply(self, resps, docs):
         def filter_set(inst):
             filtered = []
             for resp in inst:
-                match = self.regex.search(resp)
+                match = self.regex.findall(resp)
                 if match:
-                    match = match.group(1).strip()
+                    match = match[self.group_select]
+                    if isinstance(match, tuple):
+                        match = [m for m in match if m][0]
+                    match = match.strip()
                 else:
                     match = self.fallback
                 filtered.append(match)
@@ -38,23 +70,145 @@ def filter_set(inst):
         return filtered_resps
 
 
-class WhitespaceFilter(Filter):
-    """ """
+class MultiChoiceRegexFilter(RegexFilter):
+    """
+    A filter used to extract a model's answer on multiple choice questions with
+    letter answers. Assumes each document has a "choices" field
+    containing the list of answer choices and that the answer label symbols
+    are of the form (A), (B), (C), ... or A, B, C.
+    """
 
-    def __init__(self) -> None:
-        pass
+    # Cache for the punctuation-deleting translation table. Building it scans the
+    # whole Unicode code-point range, so it is built on first use and then shared.
+    _punct_tbl = None
+
+    def __init__(
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+        ignore_case=False,
+        ignore_punctuation=False,
+        regexes_to_ignore=None,
+    ) -> None:
+        r"""
+        regex_pattern: The basic regex pattern to use. If it fails to match, the customized match procedure is used:
+                        - step 1 : The choice texts are searched for in the response and mapped to (A), (B), ...
+                        - step 2 : The choice letter is parsed with the regex :[\s]*([A-?]), where ? varies by number of choices.
+        group_select: Selects the (group_select)th match from the findall result.
+        fallback: Value emitted when none of the matching steps finds an answer.
+        ignore_case: Ignores the case during step 1 matching
+        ignore_punctuation: Remove the punctuation during step 1 matching
+        regexes_to_ignore: Remove these regexes during step 1 matching
+        """
+        super().__init__(regex_pattern, group_select, fallback)
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.regexes_to_ignore = regexes_to_ignore
+
+    @staticmethod
+    def _punctuation_table():
+        """Return the shared table that deletes Unicode punctuation, building it once."""
+        if MultiChoiceRegexFilter._punct_tbl is None:
+            MultiChoiceRegexFilter._punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P"))
+        return MultiChoiceRegexFilter._punct_tbl
 
     def apply(self, resps, docs):
-        def filter_set(inst):
-            filtered_resp = []
-            for resp in inst:
-                if resp.startswith(" "):
-                    resp = resp[1:]
+        """Extract one answer per response; returns a list of lists shaped like `resps`."""
+        # here, we assume we have a list, in which each element is
+        # a list of model responses for some particular input/target pair.
+        # so we process each of these (same input/target response sets)
+        # independently (and keep them a list.)
 
-                filtered_resp.append(resp)
+        def find_match(regex, resp, convert_dict=None):
+            # Returns the selected match (mapped through convert_dict when present),
+            # or a falsy value ([] or "") when nothing usable was found.
+            match = regex.findall(resp)
+            if match:
+                match = match[self.group_select]
+                if isinstance(match, tuple):
+                    # Several capture groups: keep the first non-empty one, or ""
+                    # when every group is empty (previously an IndexError).
+                    match = next((m for m in match if m), "")
+                match = match.strip()
+                if match and convert_dict and match in convert_dict:
+                    match = convert_dict[match]
+            return match
 
-            return filtered_resp
+        # Only pay for the punctuation table when it is actually going to be used.
+        punct_tbl = self._punctuation_table() if self.ignore_punctuation else None
 
-        filtered_resps = [filter_set(resp) for resp in resps]
+        def filter_ignores(st):
+            if self.regexes_to_ignore is not None:
+                for s in self.regexes_to_ignore:
+                    st = re.sub(s, "", st)
+
+            if self.ignore_case:
+                st = st.lower()
+
+            if self.ignore_punctuation:
+                # https://stackoverflow.com/a/266162
+                st = st.translate(punct_tbl)
+            return st
+
+        filtered_resps = []
+
+        for r, doc in zip(resps, docs):
+            # Step 1 patterns: the (normalised) choice texts, mapped to "(A)", "(B)", ...
+            fallback_regexes = []
+            choice_to_alpha = {}
+            next_alpha = "A"
+
+            # Step 2 patterns: bare letters following a colon, mapped to "(A)", "(B)", ...
+            without_paren_fallback_regexes = []
+            without_paren_to_target = {}
+
+            choices = doc["choices"]
+            for c in choices:
+                m = filter_ignores(c.strip())
+                fallback_regexes.append(f"{re.escape(m)}")
+                choice_to_alpha[m] = f"({next_alpha})"
+
+                without_paren_fallback_regexes.append(next_alpha)
+                without_paren_to_target[next_alpha] = f"({next_alpha})"
+
+                next_alpha = chr(ord(next_alpha) + 1)
+            fallback_regex = re.compile("|".join(fallback_regexes))
+            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
+            # Raw f-string so that `\s` reaches the regex engine without an invalid-escape warning.
+            without_paren_fallback_regex = re.compile(rf":[\s]*({without_paren_fallback_regex})")
+
+            filtered = []
+            for resp in r:
+                # Try the configured pattern first, then the choice texts, then bare letters.
+                match = find_match(self.regex, resp)
+                if not match:
+                    match = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
+                    if not match:
+                        match = find_match(without_paren_fallback_regex, resp, without_paren_to_target)
+                if not match:
+                    match = self.fallback
+                filtered.append(match)
+            filtered_resps.append(filtered)
 
         return filtered_resps
+
+
+class ExtendedRegexFilter(RegexFilter):
+    """
+    RegexFilter variant that carries text-normalisation options and helpers.
+
+    `filter_ignores` normalises a string according to the configured options and
+    `find_match` selects one regex match, optionally mapping it through a dict.
+    """
+
+    # Maps every code point whose Unicode category starts with "P" (punctuation)
+    # to None, so that `str.translate` deletes those characters.
+    punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P"))
+
+    def __init__(
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+        ignore_case=False,
+        ignore_punctuation=False,
+        regexes_to_ignore=None,
+    ) -> None:
+        """
+        regex_pattern, group_select, fallback: forwarded unchanged to RegexFilter.
+        ignore_case: lower-case strings passed through `filter_ignores`.
+        ignore_punctuation: delete punctuation from strings passed through `filter_ignores`.
+        regexes_to_ignore: patterns whose matches `filter_ignores` removes (None disables this).
+        """
+        super().__init__(regex_pattern, group_select, fallback)
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.regexes_to_ignore = regexes_to_ignore
+
+    def filter_ignores(self, st):
+        """Return `st` after applying the configured removals, in a fixed order."""
+        # Regex removals run first, on the original (case- and punctuation-preserving) text.
+        if self.regexes_to_ignore is not None:
+            for s in self.regexes_to_ignore:
+                st = re.sub(s, "", st)
+
+        if self.ignore_case:
+            st = st.lower()
+
+        if self.ignore_punctuation:
+            # https://stackoverflow.com/a/266162
+            st = st.translate(self.punct_tbl)
+        return st
+
+    def find_match(self, regex, resp, convert_dict=None):
+        """
+        Return the `group_select`-th `findall` match of `regex` in `resp`.
+
+        The match is stripped and, when it is a key of `convert_dict`, replaced by
+        the mapped value. A falsy value is returned when nothing usable is found:
+        the empty `findall` list, or "" when the selected match is empty.
+        """
+        match = regex.findall(resp)
+        if match:
+            match = match[self.group_select]
+            if isinstance(match, tuple):
+                # Several capture groups: keep the first non-empty one, or ""
+                # when every group is empty (previously an IndexError).
+                match = next((m for m in match if m), "")
+            match = match.strip()
+            if match and convert_dict and match in convert_dict:
+                match = convert_dict[match]
+        return match
@@ -2,6 +2,8 @@
 
 AVAILABLE_MODELS = {
     "llava": "Llava",
+    "llava_hf": "LlavaHf",
+    "llava_sglang": "LlavaSglang",
     "qwen_vl": "Qwen_VL",
     "fuyu": "Fuyu",
     "gpt4v": "GPT4V",