[DOCS] Add example for HammingDiversityLogitsProcessor #25481
@@ -1085,20 +1085,134 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
class HammingDiversityLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] that enforces diverse beam search.

Note that this logits processor is only effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam
Search: Decoding Diverse Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more
details.

<Tip>

Diverse beam search can be particularly useful in scenarios where a variety of different outputs is desired, rather
than multiple similar sequences. It allows the model to explore different generation paths and provides broader
coverage of possible outputs.

</Tip>
<Warning>

This logits processor can be resource-intensive, especially when using large models or long sequences.

</Warning>
Traditional beam search often generates very similar sequences across different beams.

The `HammingDiversityLogitsProcessor` addresses this by penalizing beams that generate tokens already chosen by
other beams in the same time step.
How It Works:
- **Grouping Beams**: Beams are divided into groups. Each group selects tokens independently of the others.
- **Penalizing Repeated Tokens**: If a beam in a group selects a token already chosen by another group in the same
  step, a penalty is applied to that token's score (see the sketch after this list).
- **Promoting Diversity**: This penalty discourages beams within a group from selecting the same tokens as beams in
  other groups.
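As a rough illustration of this penalty, the sketch below subtracts `diversity_penalty` times the frequency of each
token already selected by earlier groups at the current step. It is a minimal, hypothetical helper with a simplified
signature, not the actual `HammingDiversityLogitsProcessor` implementation, which operates on batched beam scores
and additional bookkeeping arguments:

```python
import torch


def apply_hamming_diversity_penalty(
    scores: torch.FloatTensor,  # (num_beams_in_group, vocab_size) next-token scores for the current group
    previous_group_tokens: torch.LongTensor,  # tokens chosen by earlier groups at this time step
    diversity_penalty: float,
    vocab_size: int,
) -> torch.FloatTensor:
    # Count how often each vocabulary token was already picked by the earlier groups ...
    token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.dtype)
    # ... and subtract a penalty proportional to that count, discouraging repeats across groups.
    return scores - diversity_penalty * token_frequency


# Toy usage: 2 beams in the current group, a 5-token vocabulary, and earlier groups
# that already picked tokens 1 and 3 at this step.
scores = torch.zeros(2, 5)
penalized = apply_hamming_diversity_penalty(scores, torch.tensor([1, 3]), diversity_penalty=1.0, vocab_size=5)
print(penalized)  # tokens 1 and 3 now score -1.0 for every beam in this group
```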

Benefits:
- **Diverse Outputs**: Produces a variety of different sequences.
- **Exploration**: Allows the model to explore different paths.
Args:
    diversity_penalty (`float`):
        This value is subtracted from a beam's score if it generates a token that has already been chosen by any
        beam from another group at a particular time step. Note that `diversity_penalty` is only effective if
        group beam search is enabled. A higher `diversity_penalty` enforces greater diversity among the beams,
        making it less likely for multiple beams to choose the same token. Conversely, a lower penalty allows
        beams to more freely choose similar tokens. Adjusting this value can help strike a balance between
        diversity and natural likelihood.
    num_beams (`int`):
        Number of beams used for group beam search. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for
        more details. Beam search is a method that maintains several beams (or "hypotheses") at each step,
        expanding each one and keeping the top-scoring sequences. A higher `num_beams` explores more candidate
        sequences, which can increase the chance of finding a high-quality output but also increases the
        computational cost.
    num_beam_groups (`int`):
        Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
        See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details. Each group of beams operates
        independently, selecting tokens without considering the choices of the other groups. This division
        promotes diversity by ensuring that beams in different groups explore different paths. For instance, if
        `num_beams` is 6 and `num_beam_groups` is 2, there will be 2 groups each containing 3 beams. The choice
        of `num_beam_groups` should balance the desired level of output diversity against the total number of
        beams.
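To make the "6 beams in 2 groups" example concrete, the processor can also be constructed directly with these
arguments; the values below are illustrative, and in typical use `generate` builds this processor internally when
`num_beam_groups > 1` and a `diversity_penalty > 0` is passed:

```python
from transformers import HammingDiversityLogitsProcessor

# 6 beams split into 2 groups of 3; a penalty is subtracted whenever a beam picks a
# token that an earlier group already chose at the same step.
# Note: `num_beams` is expected to be divisible by `num_beam_groups`.
processor = HammingDiversityLogitsProcessor(
    diversity_penalty=1.0,
    num_beams=6,
    num_beam_groups=2,
)
```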

Example: the example below compares summaries generated with and without Hamming diversity.
```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Input variable is a long text about space:
text = (
    "The Solar System is a gravitationally bound system comprising the Sun and the objects that orbit it, "
    "either directly or indirectly. Of the objects that orbit the Sun directly, the largest are the eight "
    "planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System "
    "bodies. The Solar System formed 4.6 billion years ago from the gravitational collapse of a giant "
    "interstellar molecular cloud."
)

# Prepare the input
encoder_input_str = "summarize: " + text
encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids

# Set the parameters for diverse beam search
num_beams = 8  # higher is more diverse
# 4 groups of 2 beams will explore 4 * 2 = 8 beams (= num_beams). By separating the beams into groups and
# applying penalties within groups, the model is encouraged to explore different sequence possibilities in
# each group.
num_beam_groups = 4
# Enforces diversity among different groups of beams; discourages beams within a group from selecting the
# same tokens.
diversity_penalty = 5.5

# Generate three diverse summaries using the `generate` method
outputs_diverse = model.generate(
    encoder_input_ids,
    max_length=100,
    num_beams=num_beams,
    num_beam_groups=num_beam_groups,
    diversity_penalty=diversity_penalty,
    no_repeat_ngram_size=2,
    early_stopping=True,
    num_return_sequences=3,
)

# Generate two non-diverse summaries
outputs_non_diverse = model.generate(
    encoder_input_ids,
    max_length=100,
    num_beams=num_beams,
    no_repeat_ngram_size=2,
    early_stopping=True,
    num_return_sequences=2,
)

# Decode and print the summaries
summaries_diverse = tokenizer.batch_decode(outputs_diverse, skip_special_tokens=True)
summaries_non_diverse = tokenizer.batch_decode(outputs_non_diverse, skip_special_tokens=True)

# Print the results
print("Diverse Summaries:")
for summary in summaries_diverse:
    print(summary)
# summary 1: the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud
# summary 2: the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets, says john mccartney jr.
# summary 3: solar system formed 4.6 billion years ago from collapse of interstellar molecular cloud. largest of the eight planets orbit the Sun directly, with the remainder being smaller objects, such as dwarf planet and small solar System bodies - nicolaus mills-simons: the largest are the dwarf worlds and the solar systems' bodies.

print("\nNon-Diverse Summaries:")
for summary in summaries_non_diverse:
    print(summary)
# summary 1: the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud.
# summary 2: the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud.
```
For more details, see [Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence
Models](https://arxiv.org/pdf/1610.02424.pdf).

"""

def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):