huggingface · ydshieh · Mar 7, 2023 · Mar 7, 2023 · Mar 7, 2023 · Mar 7, 2023
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
@@ -82,6 +82,7 @@
         ("swinv2", "ViTFeatureExtractor"),
         ("table-transformer", "DetrFeatureExtractor"),
         ("timesformer", "VideoMAEFeatureExtractor"),
+        ("tvlt", "TvltFeatureExtractor"),
         ("unispeech", "Wav2Vec2FeatureExtractor"),
         ("unispeech-sat", "Wav2Vec2FeatureExtractor"),
         ("van", "ConvNextFeatureExtractor"),

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -87,6 +87,7 @@
         ("swinv2", "ViTImageProcessor"),
         ("table-transformer", "DetrImageProcessor"),
         ("timesformer", "VideoMAEImageProcessor"),
+        ("tvlt", "TvltImageProcessor"),
         ("upernet", "SegformerImageProcessor"),
         ("van", "ConvNextImageProcessor"),
         ("videomae", "VideoMAEImageProcessor"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -65,6 +65,7 @@
         ("speech_to_text_2", "Speech2Text2Processor"),
         ("speecht5", "SpeechT5Processor"),
         ("trocr", "TrOCRProcessor"),
+        ("tvlt", "TvltProcessor"),
         ("unispeech", "Wav2Vec2Processor"),
         ("unispeech-sat", "Wav2Vec2Processor"),
         ("vilt", "ViltProcessor"),

diff --git a/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py
@@ -31,8 +31,7 @@ class GPTSanJapaneseConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate
     a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese
-    [tanreinama/GPTSAN-2.8B-spout_is_uniform](https://huggingface.co/tanreinama/GPTSAN-2.8B-spout_is_uniform)
-    architecture.
+    [Tanrei/GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.

diff --git a/src/transformers/models/timesformer/configuration_timesformer.py b/src/transformers/models/timesformer/configuration_timesformer.py
@@ -30,7 +30,8 @@ class TimesformerConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`TimesformerModel`]. It is used to instantiate a
     TimeSformer model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the TimeSformer
-    [facebook/timesformer](https://huggingface.co/facebook/timesformer-base-finetuned-k600) architecture.
+    [facebook/timesformer-base-finetuned-k600](https://huggingface.co/facebook/timesformer-base-finetuned-k600)
+    architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.

diff --git a/src/transformers/models/tvlt/configuration_tvlt.py b/src/transformers/models/tvlt/configuration_tvlt.py
@@ -30,7 +30,7 @@ class TvltConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the TVLT
-    [TVLT/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
+    [ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.

diff --git a/src/transformers/models/xmod/configuration_xmod.py b/src/transformers/models/xmod/configuration_xmod.py
@@ -41,8 +41,8 @@ class XmodConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`XmodModel`]. It is used to instantiate an X-MOD
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the [xmod-base](https://huggingface.co/facebook/xmod-base)
-    architecture.
+    defaults will yield a similar configuration to that of the
+    [facebook/xmod-base](https://huggingface.co/facebook/xmod-base) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.

diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py
@@ -56,6 +56,7 @@ def __init__(
         parent,
         batch_size=2,
         is_training=True,
+        vocab_size=99,
         use_auxiliary_loss=False,
         num_queries=10,
         num_channels=3,
@@ -69,6 +70,7 @@ def __init__(
         self.parent = parent
         self.batch_size = batch_size
         self.is_training = is_training
+        self.vocab_size = vocab_size
         self.use_auxiliary_loss = use_auxiliary_loss
         self.num_queries = num_queries
         self.num_channels = num_channels
@@ -84,12 +86,16 @@ def prepare_config_and_inputs(self):
             torch_device
         )
 
-        task_inputs = torch.randint(high=49408, size=(self.batch_size, self.sequence_length)).to(torch_device).long()
+        task_inputs = (
+            torch.randint(high=self.vocab_size, size=(self.batch_size, self.sequence_length)).to(torch_device).long()
+        )
 
         pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device)
 
         text_inputs = (
-            torch.randint(high=49408, size=(self.batch_size, self.num_queries - self.n_ctx, self.sequence_length))
+            torch.randint(
+                high=self.vocab_size, size=(self.batch_size, self.num_queries - self.n_ctx, self.sequence_length)
+            )
             .to(torch_device)
             .long()
         )
@@ -104,6 +110,7 @@ def prepare_config_and_inputs(self):
 
     def get_config(self):
         config = OneFormerConfig(
+            text_encoder_vocab_size=self.vocab_size,
             hidden_size=self.hidden_dim,
         )
 
@@ -303,8 +310,10 @@ def test_model_with_labels(self):
         size = (self.model_tester.min_size,) * 2
         inputs = {
             "pixel_values": torch.randn((2, 3, *size), device=torch_device),
-            "task_inputs": torch.randint(high=49408, size=(2, 77), device=torch_device).long(),
-            "text_inputs": torch.randint(high=49408, size=(2, 134, 77), device=torch_device).long(),
+            "task_inputs": torch.randint(high=self.model_tester.vocab_size, size=(2, 77), device=torch_device).long(),
+            "text_inputs": torch.randint(
+                high=self.model_tester.vocab_size, size=(2, 134, 77), device=torch_device
+            ).long(),
             "mask_labels": torch.randn((2, 150, *size), device=torch_device),
             "class_labels": torch.zeros(2, 150, device=torch_device).long(),
         }

diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
@@ -103,6 +103,7 @@ def __init__(
         batch_size=13,
         seq_length=7,
         is_training=False,
+        vocab_size=81,
         hidden_size=24,
         num_hidden_layers=4,
         num_attention_heads=2,
@@ -112,6 +113,7 @@ def __init__(
         self.batch_size = batch_size
         self.seq_length = seq_length
         self.is_training = is_training
+        self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
@@ -140,6 +142,7 @@ def prepare_config_and_inputs_for_common(self):
 
     def get_config(self):
         return SpeechT5Config(
+            vocab_size=self.vocab_size,
             hidden_size=self.hidden_size,
             encoder_layers=self.num_hidden_layers,
             decoder_layers=self.num_hidden_layers,

diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py
@@ -51,10 +51,12 @@ def get_checkpoint_from_config_class(config_class):
     config_source = inspect.getsource(config_class)
     checkpoints = _re_checkpoint.findall(config_source)
 
-    for checkpoint in checkpoints:
-        # Each `checkpoint` is a tuple of a checkpoint name and a checkpoint link.
-        # For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')`
-        ckpt_name, ckpt_link = checkpoint
+    # Each element of `checkpoints` is a tuple of a checkpoint name and a checkpoint link.
+    # For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')`
+    for ckpt_name, ckpt_link in checkpoints:
+        # allow the link to end with `/`
+        if ckpt_link.endswith("/"):
+            ckpt_link = ckpt_link[:-1]
 
         # verify the checkpoint name corresponds to the checkpoint link
         ckpt_link_from_name = f"https://huggingface.co/{ckpt_name}"

diff --git a/utils/create_dummy_models.py b/utils/create_dummy_models.py
@@ -782,6 +782,11 @@ def get_config_overrides(config_class, processors):
     # CLIP-like models have `text_model_tester` and `vision_model_tester`, and we need to pass `vocab_size` to
     # `text_model_tester` via `text_kwargs`. The same trick is also necessary for `Flava`.
     if config_class.__name__ in [
+        "AlignConfig",
+        "AltCLIPConfig",
+        "ChineseCLIPConfig",
+        "CLIPSegConfig",
+        "ClapConfig",
         "CLIPConfig",
         "GroupViTConfig",
         "OwlViTConfig",