Merged
Changes from 10 commits

Commits (23):
ab75556 - updated logits processor text (jesspeck, Aug 13, 2023)
5f3eb85 - Update logits_process.py (jessthebp, Aug 13, 2023)
0b666a3 - Merge branch 'huggingface:main' into dev-documentation-HammingDiversi… (jessthebp, Aug 14, 2023)
ad6ceb6 - fixed formatting with black (jesspeck, Aug 14, 2023)
f54e125 - Merge remote-tracking branch 'origin/dev-documentation-HammingDiversi… (jesspeck, Aug 14, 2023)
bfb1536 - fixed formatting with black (jesspeck, Aug 14, 2023)
5651226 - Merge branch 'huggingface:main' into dev-documentation-HammingDiversi… (jessthebp, Aug 17, 2023)
4764308 - fixed formatting with Make Fixup (jesspeck, Aug 17, 2023)
a9c5b32 - Merge branch 'huggingface:main' into dev-documentation-HammingDiversi… (jessthebp, Aug 18, 2023)
c7a9ae8 - more formatting fixes (jesspeck, Aug 18, 2023)
0b9528c - Update src/transformers/generation/logits_process.py (jessthebp, Aug 21, 2023)
aa51216 - Update src/transformers/generation/logits_process.py (jessthebp, Aug 21, 2023)
3584a53 - Revert "fixed formatting with Make Fixup" (jesspeck, Aug 21, 2023)
8b50875 - Revert "fixed formatting with black" (jesspeck, Aug 23, 2023)
b11f536 - Revert "fixed formatting with Make Fixup" (jesspeck, Aug 23, 2023)
a21a3ab - Revert "fixed formatting with Make Fixup" (jesspeck, Aug 23, 2023)
aef619c - Revert "fixed formatting with black" (jesspeck, Aug 23, 2023)
f6c301b - Revert "fixed formatting with black" (jesspeck, Aug 23, 2023)
d34c714 - Update src/transformers/generation/logits_process.py (jessthebp, Aug 23, 2023)
a29cbee - Merge remote-tracking branch 'origin/dev-documentation-HammingDiversi… (jesspeck, Aug 23, 2023)
e252405 - Revert "fixed formatting with Make Fixup" (jesspeck, Aug 23, 2023)
33c682c - formatted logits_process with make fixup (jesspeck, Aug 23, 2023)
ba56724 - Merge branch 'huggingface:main' into dev-documentation-HammingDiversi… (jessthebp, Aug 23, 2023)
2 changes: 1 addition & 1 deletion docs/source/_config.py
@@ -10,5 +10,5 @@
black_avoid_patterns = {
"{processor_class}": "FakeProcessorClass",
"{model_class}": "FakeModelClass",
"{object_class}": "FakeObjectClass",
"{object_class}": "FakeObjectClass",
}
2 changes: 1 addition & 1 deletion docs/source/en/_config.py
@@ -10,5 +10,5 @@
black_avoid_patterns = {
"{processor_class}": "FakeProcessorClass",
"{model_class}": "FakeModelClass",
"{object_class}": "FakeObjectClass",
"{object_class}": "FakeObjectClass",
}
2 changes: 1 addition & 1 deletion docs/source/ko/_config.py
@@ -10,5 +10,5 @@
black_avoid_patterns = {
"{processor_class}": "FakeProcessorClass",
"{model_class}": "FakeModelClass",
"{object_class}": "FakeObjectClass",
"{object_class}": "FakeObjectClass",
}
2 changes: 1 addition & 1 deletion docs/source/pt/_config.py
@@ -10,5 +10,5 @@
black_avoid_patterns = {
"{processor_class}": "FakeProcessorClass",
"{model_class}": "FakeModelClass",
"{object_class}": "FakeObjectClass",
"{object_class}": "FakeObjectClass",
}
2 changes: 1 addition & 1 deletion examples/flax/text-classification/run_flax_glue.py
@@ -212,7 +212,7 @@ def __post_init__(self):
if self.validation_file is not None:
extension = self.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
self.task_name = self.task_name.lower() if type(self.task_name) == str else self.task_name
self.task_name = self.task_name.lower() if isinstance(self.task_name, str) else self.task_name


def create_train_state(
120 changes: 117 additions & 3 deletions src/transformers/generation/logits_process.py
@@ -1085,20 +1085,134 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to

class HammingDiversityLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] that enforces diverse beam search. Note that this logits processor is only effective for
[`PreTrainedModel.group_beam_search`]. See [Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence
Models](https://arxiv.org/pdf/1610.02424.pdf) for more details.
[`LogitsProcessor`] that enforces diverse beam search.

Note that this logits processor is only effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam
Search: Decoding Diverse Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more
details.

<Tip>

Diverse beam search can be particularly useful in scenarios where a variety of different outputs is desired, rather
than multiple similar sequences. It allows the model to explore different generation paths and provides a broader
coverage of possible outputs.

</Tip>

<Warning>

This logits processor can be resource-intensive, especially when using large models or long sequences.

</Warning>

Traditional beam search often generates very similar sequences across different beams.

The `HammingDiversityLogitsProcessor` addresses this by penalizing beams that generate tokens already chosen by
other beams in the same time step.

How It Works:
- **Grouping Beams**: Beams are divided into groups. Each group extends its beams in turn, running what is
effectively its own beam search.
- **Penalizing Repeated Tokens**: If a beam in a group selects a token already chosen by an earlier group in the
same time step, a penalty is applied to that token's score.
- **Promoting Diversity**: This penalty discourages beams within a group from selecting the same tokens as beams in
other groups (a minimal sketch of the penalty step follows the Benefits list below).

Benefits:
- **Diverse Outputs**: Produces a variety of different sequences.
- **Exploration**: Allows the model to explore different paths.
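
To make the penalty concrete, here is a minimal sketch of the per-step computation, assuming we have the scores for
one beam group and the tokens already selected by earlier groups at the same step. It illustrates the idea rather
than reproducing the library's exact code, and the helper name `apply_hamming_penalty` is hypothetical:

```python
import torch


def apply_hamming_penalty(
    scores: torch.FloatTensor,  # (group_size, vocab_size) scores for the current group
    previous_group_tokens: torch.LongTensor,  # tokens chosen by earlier groups this step
    diversity_penalty: float,
) -> torch.FloatTensor:
    # Illustrative only: count how often each vocabulary id was already picked by
    # earlier groups at this time step, then subtract a proportional penalty.
    frequency = torch.bincount(previous_group_tokens, minlength=scores.shape[-1])
    return scores - diversity_penalty * frequency.to(scores.device)
```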

Args:
    diversity_penalty (`float`):
        The penalty subtracted from a beam's score when it generates a token that has already been chosen by a
        beam from another group during the same time step. Note that `diversity_penalty` is only effective if
        group beam search is enabled.
        - A higher `diversity_penalty` enforces greater diversity among the beams, making it less likely for
          multiple beams to choose the same token.
        - Conversely, a lower penalty allows beams to choose similar tokens more freely.
        - Adjusting this value can help strike a balance between diversity and natural likelihood.
    num_beams (`int`):
        Number of beams used for group beam search. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for
        more details.
        - Beam search maintains multiple hypotheses ("beams") at each step, expanding each one and keeping the
          top-scoring sequences.
        - A higher `num_beams` explores more potential sequences; this can increase the chance of finding a
          high-quality output but also increases computational cost.
    num_beam_groups (`int`):
        Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
        beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
        - Each group of beams operates independently, selecting tokens without considering the choices of the
          other groups.
        - This division promotes diversity by ensuring that beams within different groups explore different
          paths. For instance, if `num_beams` is 6 and `num_beam_groups` is 2, there will be 2 groups each
          containing 3 beams.
        - Choose `num_beam_groups` considering the desired level of output diversity and the total number of
          beams; `num_beams` must be divisible by `num_beam_groups`.
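
As a small sketch of that relationship (arbitrary values; `group_size` is just an illustrative name):

```python
num_beams = 6
num_beam_groups = 2

# Group beam search splits the beams evenly across groups, so num_beams must
# be divisible by num_beam_groups.
assert num_beams % num_beam_groups == 0
group_size = num_beams // num_beam_groups  # 2 groups of 3 beams each
```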


Example: the example below compares summaries generated with and without the Hamming diversity penalty.

Contributor: Final nit 1: let's remove indentation in the example, it shows up awkwardly in the rendered docs (see here the preview)

Collaborator: You also need to add >>> before the code as it is done for every other logit processor.

    Examples:

    ```python
    >>> from transformers import AutoTokenizer, AutoModelForCausalLM

Contributor: oh, yeah, and the missing >>> / ..., good catch!

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Input variable is a long text about space:

text = "The Solar System is a gravitationally bound system comprising the Sun and the objects that orbit it, either directly or indirectly. Of the objects that orbit the Sun directly, the largest are the eight planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System bodies. The Solar System formed 4.6 billion years ago from the gravitational collapse of a giant interstellar molecular cloud."

# Prepare the input
encoder_input_str = "summarize: " + text
encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids

# Set the parameters for diverse beam search
num_beams = 8  # higher is more diverse
# 4 groups of 2 beams explore 4*2=8 beams (=num_beams); separating the beams into
# groups and applying penalties across groups encourages each group to explore
# different sequence possibilities
num_beam_groups = 4
# penalizes a beam for selecting tokens already chosen by beams in other groups
# at the same step, enforcing diversity between the groups
diversity_penalty = 5.5

# Generate three diverse summaries using the `generate` method
outputs_diverse = model.generate(
    encoder_input_ids,
    max_length=100,
    num_beams=num_beams,
    num_beam_groups=num_beam_groups,
    diversity_penalty=diversity_penalty,
    no_repeat_ngram_size=2,
    early_stopping=True,
    num_return_sequences=3,
)

# Generate two non-diverse summaries
outputs_non_diverse = model.generate(
    encoder_input_ids,
    max_length=100,
    num_beams=num_beams,
    no_repeat_ngram_size=2,
    early_stopping=True,
    num_return_sequences=2,
)

# Decode and print the summaries
summaries_diverse = tokenizer.batch_decode(outputs_diverse, skip_special_tokens=True)
summaries_non_diverse = tokenizer.batch_decode(outputs_non_diverse, skip_special_tokens=True)

# Print the results
print("Diverse Summaries:")
for summary in summaries_diverse:
    print(summary)
# summary 1: the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud
# summary 2: the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets, says john mccartney jr.
# summary 3: solar system formed 4.6 billion years ago from collapse of interstellar molecular cloud. largest of the eight planets orbit the Sun directly, with the remainder being smaller objects, such as dwarf planet and small solar System bodies - nicolaus mills-simons: the largest are the dwarf worlds and the solar systems' bodies.

print("\nNon-Diverse Summaries:")
for summary in summaries_non_diverse:
    print(summary)
# summary 1: the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud.
# summary 2: the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud.
```
For more details, see [Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence
Models](https://arxiv.org/pdf/1610.02424.pdf).

Contributor: Final nit 2: let's move this to above the example, and decrease the indentation -- it is not rendering correctly (see here the preview)

Contributor (Author): Argh, sorry. Just autoaccepted the formatting suggestions from the CLI. I'll revert the weird/unnecessary changes + fix the formatting for the logits comment.

"""

def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
6 changes: 3 additions & 3 deletions src/transformers/models/esm/modeling_esmfold.py
@@ -1060,7 +1060,7 @@ def __init__(self, r: float, batch_dim: Union[int, List[int]]):
super().__init__()

self.r = r
if type(batch_dim) == int:
if isinstance(batch_dim, int):
batch_dim = [batch_dim]
self.batch_dim = batch_dim
self.dropout = nn.Dropout(self.r)
@@ -2254,7 +2254,7 @@ def infer(
seqs: Union[str, List[str]],
position_ids=None,
):
if type(seqs) is str:
if isinstance(seqs, str):
lst = [seqs]
else:
lst = seqs
@@ -2312,7 +2312,7 @@ def output_to_pdb(output: Dict) -> List[str]:

def infer_pdb(self, seqs, *args, **kwargs) -> str:
"""Returns the pdb (file) string from the model given an input sequence."""
assert type(seqs) is str
assert isinstance(seqs, str)
output = self.infer(seqs, *args, **kwargs)
return self.output_to_pdb(output)[0]

4 changes: 2 additions & 2 deletions src/transformers/models/llama/tokenization_llama.py
@@ -409,8 +409,8 @@ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[in
raise ValueError("Last message must be from user")

dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
[not is_user for is_user, msg in dialogue[1::2]]
if not all(is_user for is_user, msg in dialogue[::2]) or not all(
not is_user for is_user, msg in dialogue[1::2]
):
raise ValueError(
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
4 changes: 2 additions & 2 deletions src/transformers/models/llama/tokenization_llama_fast.py
@@ -230,8 +230,8 @@ def _build_conversation_input_ids(self, conversation: "Conversation"):
raise ValueError("Last message must be from user")

dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
[not is_user for is_user, msg in dialogue[1::2]]
if not all(is_user for is_user, msg in dialogue[::2]) or not all(
not is_user for is_user, msg in dialogue[1::2]
):
raise ValueError(
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
2 changes: 1 addition & 1 deletion tests/test_modeling_common.py
@@ -1955,7 +1955,7 @@ def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict):
tf_inputs_dict = {}
for key, tensor in pt_inputs_dict.items():
# skip key that does not exist in tf
if type(tensor) == bool:
if isinstance(tensor, bool):
tf_inputs_dict[key] = tensor
elif key == "input_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
2 changes: 1 addition & 1 deletion tests/test_modeling_tf_common.py
@@ -576,7 +576,7 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam
def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict):
pt_inputs_dict = {}
for name, key in tf_inputs_dict.items():
if type(key) == bool:
if isinstance(key, bool):
pt_inputs_dict[name] = key
elif name == "input_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)