Merged

Changes shown from 58 of 94 commits.

Commits
05ef62e
Add templates for gpt-sw3
ekgren Nov 14, 2022
aa2fb95
Add templates for gpt-sw3
ekgren Nov 14, 2022
1778a3f
Added sentencepiece tokenizer
ekgren Nov 14, 2022
a327bc9
intermediate commit with many changes
ekgren Nov 16, 2022
85a8643
gpt-sw3 updates
ekgren Nov 16, 2022
6e05043
fixed conflicts
ekgren Nov 16, 2022
e333cd2
Init commit for tokenization port
JoeyOhman Nov 16, 2022
268b116
Tokenization progress
JoeyOhman Nov 17, 2022
e6d806a
Remove fast tokenizer
JoeyOhman Nov 18, 2022
e5b05e4
Clean up and rename spm.model -> spiece.model
JoeyOhman Nov 18, 2022
17bbc59
Remove TF -> PT conversion script template, Clean up Megatron -> PT s…
JoeyOhman Nov 18, 2022
4167192
Optimize encode & decode performance
JoeyOhman Nov 21, 2022
26d522b
added new attention
ekgren Nov 21, 2022
96f5d0e
added new attention
ekgren Nov 21, 2022
a94aa0e
Merge branch 'add_gpt_sw3' of github.com:ekgren/transformers-hf into …
ekgren Nov 21, 2022
39a8e8f
attention for gpt-sw3 working
ekgren Nov 22, 2022
1d7759f
attention good
ekgren Nov 22, 2022
b7ef07a
Cache is now working
JoeyOhman Nov 22, 2022
39892bf
fixed attention mask so that it works with causal attention
ekgren Nov 22, 2022
891cfb0
fixed badbmm bug for cpu and caching
ekgren Nov 24, 2022
8ed7fb2
updated config with correct parameters
ekgren Nov 24, 2022
eb1336b
Refactor and leave optimizations as separate functions to avoid break…
JoeyOhman Nov 24, 2022
b9be87f
Fix special tokens mapping for both tokenizers
JoeyOhman Nov 25, 2022
6d8de24
cleaning up of code and comments
ekgren Nov 28, 2022
65577c2
fixed conflicts in convert script
ekgren Nov 28, 2022
285b33b
HF compatible attention outputs
JoeyOhman Nov 28, 2022
682556f
Tokenizer now passing tests, add documentation
JoeyOhman Nov 29, 2022
d3a143e
Update documentation
JoeyOhman Nov 29, 2022
5e40481
reverted back to base implementation after checking that it is identi…
ekgren Nov 29, 2022
eb1d4cb
updated gpt-sw3 config
ekgren Nov 29, 2022
01192f0
updated conversion script
ekgren Nov 29, 2022
29bebce
Merge branch 'add_gpt_sw3' of github.com:ekgren/transformers-hf into …
ekgren Nov 29, 2022
bfde918
aligned parameters with gpt-sw3 config
ekgren Nov 29, 2022
988a9ca
changed default scale_attn_by_inverse_layer_idx to true
ekgren Nov 29, 2022
4abb731
removed flag from conversion script
ekgren Nov 29, 2022
fe2e353
added temporary model path
ekgren Nov 29, 2022
b0f94a9
reverted back to functioning convert script
ekgren Nov 29, 2022
cfef112
small changes to default config
ekgren Nov 29, 2022
cefe37b
updated tests for gpt-sw3
ekgren Nov 29, 2022
fa815f6
Merge remote-tracking branch 'upstream/main' into add_gpt_sw3
ekgren Nov 29, 2022
a76f00f
Merge remote-tracking branch 'upstream/main' into add_gpt_sw3
ekgren Nov 30, 2022
9e9742a
make style, make quality, minor cleanup
JoeyOhman Nov 30, 2022
4524076
Change local paths to testing online repository
JoeyOhman Nov 30, 2022
1ef7b47
Change name: GptSw3 -> GPTSw3
JoeyOhman Nov 30, 2022
f50cf3d
Remove GPTSw3TokenizerFast references
JoeyOhman Nov 30, 2022
07caf98
Use official model repository and add more model sizes
JoeyOhman Nov 30, 2022
199f260
Added reference to 6.7b model
ekgren Nov 30, 2022
964227f
Add GPTSw3DoubleHeadsModel to IGNORE_NON_AUTO_CONFIGURED, like GPT2Do…
JoeyOhman Dec 1, 2022
cfa82f7
Remove pointers to non-existing TFGPTSw3
JoeyOhman Dec 1, 2022
e22608e
Add GPTSw3 to docs/_toctree.yml
JoeyOhman Dec 1, 2022
d6d8eb2
Remove TF artifacts from GPTSw3 in __init__ files
JoeyOhman Dec 1, 2022
65ae666
Merge remote-tracking branch 'upstream/main' into add_gpt_sw3
JoeyOhman Dec 1, 2022
fca7bcf
Update README:s with 'make fix-copies'
JoeyOhman Dec 1, 2022
bc5ae81
Add 20b model to archive list
JoeyOhman Dec 1, 2022
05fb9d8
Add documentation for GPT-Sw3
JoeyOhman Dec 1, 2022
26783cd
Fix typo in documentation for GPT-Sw3
JoeyOhman Dec 1, 2022
d8d8ff5
Do 'make fix-copies' again after having updated docs
JoeyOhman Dec 1, 2022
7795ce4
Fix some typos in docs
JoeyOhman Dec 1, 2022
b353072
Update src/transformers/models/gpt_sw3/configuration_gpt_sw3.py
JoeyOhman Dec 5, 2022
30706f7
Update src/transformers/models/gpt_sw3/configuration_gpt_sw3.py
JoeyOhman Dec 5, 2022
9e6f545
Update src/transformers/models/gpt_sw3/__init__.py
JoeyOhman Dec 5, 2022
2e44e88
Update src/transformers/models/gpt_sw3/__init__.py
JoeyOhman Dec 5, 2022
a15ee21
Update src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py
JoeyOhman Dec 5, 2022
5e18908
Update src/transformers/models/gpt_sw3/modeling_gpt_sw3.py
JoeyOhman Dec 5, 2022
0d02ec5
Update tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
JoeyOhman Dec 5, 2022
195bd0c
Update src/transformers/models/gpt_sw3/modeling_gpt_sw3.py
JoeyOhman Dec 5, 2022
5140f52
Update src/transformers/models/gpt_sw3/modeling_gpt_sw3.py
JoeyOhman Dec 5, 2022
718c441
Resolve comments from PR feedback
JoeyOhman Dec 5, 2022
3ab3643
Merge branch 'add_gpt_sw3' of github.com:ekgren/transformers-hf into …
JoeyOhman Dec 5, 2022
d309d22
Resolve more comments from PR feedback, also set use_cache=True in co…
JoeyOhman Dec 5, 2022
98002ab
Add '# Copied from' comments for GPTSw3 modeling
JoeyOhman Dec 6, 2022
5bafb6a
Set 'is_parallelizable = False'
JoeyOhman Dec 6, 2022
cc2b702
Remove '# Copied from' where code was modified and add 'with x->y' wh…
JoeyOhman Dec 6, 2022
81bf9ca
Remove parallelize in mdx
JoeyOhman Dec 6, 2022
714d7fb
make style, make quality
JoeyOhman Dec 6, 2022
2cfaa1c
Update GPTSw3Config default values and corresponding documentation
JoeyOhman Dec 6, 2022
2fcee22
Update src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
JoeyOhman Dec 9, 2022
c97dca8
Update src/transformers/models/gpt_sw3/__init__.py
JoeyOhman Dec 9, 2022
1d09a6b
Clean up and protect GPTSw3Tokenizer imports with is_sentencepiece_av…
JoeyOhman Dec 9, 2022
62a41e8
Make style, make quality
JoeyOhman Dec 9, 2022
26930a4
Add dummy object for GPTSw3Tokenizer via 'make fix-copies'
JoeyOhman Dec 9, 2022
c6b754d
Merge remote-tracking branch 'upstream/main' into add_gpt_sw3
JoeyOhman Dec 9, 2022
0201440
make fix-copies
JoeyOhman Dec 9, 2022
dc6ce32
Remove GPTSw3 modeling classes
JoeyOhman Dec 9, 2022
ef1ec13
make style, make quality
JoeyOhman Dec 9, 2022
c475766
Add GPTSw3 auto-mappings for other GPT2 heads
JoeyOhman Dec 9, 2022
609b47c
Update docs/source/en/model_doc/gpt-sw3.mdx
JoeyOhman Dec 12, 2022
f247b6c
Update src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py
JoeyOhman Dec 12, 2022
b5bc165
Update src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
JoeyOhman Dec 12, 2022
965bd5e
Remove old TODO-comment
JoeyOhman Dec 12, 2022
6790be4
Add example usage to GPTSw3Tokenizer docstring
JoeyOhman Dec 12, 2022
81006ea
make style, make quality
JoeyOhman Dec 12, 2022
d9a1d9e
Add implementation details and example usage to gpt-sw3.mdx
JoeyOhman Dec 12, 2022
df4278c
Merge remote-tracking branch 'upstream/main' into add_gpt_sw3
JoeyOhman Dec 12, 2022
Files changed
README.md (2 additions, 0 deletions)

@@ -319,6 +319,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[gpt-sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
README_es.md (2 additions, 0 deletions)

@@ -319,6 +319,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[gpt-sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
README_ja.md (2 additions, 0 deletions)

@@ -354,6 +354,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[gpt-sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
README_ko.md (2 additions, 0 deletions)

@@ -269,6 +269,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[gpt-sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
README_zh-hans.md (2 additions, 0 deletions)

@@ -293,6 +293,8 @@ conda install -c huggingface transformers
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (来自 ABEJA) 由 Shinya Otani, Takayoshi Makabe, Anuj Arora, Kyo Hattori。
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
1. **[gpt-sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
README_zh-hant.md (2 additions, 0 deletions)

@@ -305,6 +305,8 @@ conda install -c huggingface transformers
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[gpt-sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/main/model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
docs/source/en/_toctree.yml (2 additions, 0 deletions)

@@ -271,6 +271,8 @@
  title: GPT-J
- local: model_doc/gpt2
  title: GPT2
- local: model_doc/gpt-sw3
  title: GPTSw3
- local: model_doc/herbert
  title: HerBERT
- local: model_doc/ibert
docs/source/en/index.mdx (3 additions, 0 deletions)

@@ -107,6 +107,8 @@ The documentation is organized into five sections:
1. **[GPT NeoX Japanese](model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[gpt-sw3](model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GPT-Sw3](model_doc/gpt-sw3)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
@@ -270,6 +272,7 @@ Flax), PyTorch, and/or TensorFlow.
| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ |
| GPT NeoX Japanese | ✅ | ❌ | ✅ | ❌ | ❌ |
| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ |
| GPT-Sw3 | ✅ | ❌ | ✅ | ❌ | ❌ |
| GroupViT | ❌ | ❌ | ✅ | ✅ | ❌ |
| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ |
| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
docs/source/en/model_doc/gpt-sw3.mdx (72 additions, 0 deletions)

@@ -0,0 +1,72 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# GPT-Sw3

## Overview

The GPT-Sw3 model was first proposed in
[Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf)
by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman,
Fredrik Carlsson, Magnus Sahlgren.

Since that first paper we have extended our work and trained new models on our new 1.2 TB corpus named The Nordic Pile.

GPT-SW3 is a collection of large decoder-only pretrained transformer language models that were developed by AI Sweden
in collaboration with RISE and the WASP WARA for Media and Language. GPT-SW3 has been trained on a dataset containing
320B tokens in Swedish, Norwegian, Danish, Icelandic, English, and programming code. The model was pretrained using a
causal language modeling (CLM) objective utilizing the NeMo Megatron GPT implementation.

This model was contributed by [AI Sweden](https://huggingface.co/AI-Sweden).
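
A minimal generation sketch with the classes introduced in this PR. The checkpoint name `AI-Sweden/gpt-sw3-356m` is an assumption for illustration; substitute the actual model repository.

```python
from transformers import GPTSw3Tokenizer, GPTSw3LMHeadModel

checkpoint = "AI-Sweden/gpt-sw3-356m"  # hypothetical checkpoint name

tokenizer = GPTSw3Tokenizer.from_pretrained(checkpoint)
model = GPTSw3LMHeadModel.from_pretrained(checkpoint)

# Encode a Swedish prompt and sample a short continuation.
input_ids = tokenizer("Träd är fina för att", return_tensors="pt")["input_ids"]
generated = model.generate(input_ids, max_new_tokens=10, do_sample=True)
print(tokenizer.decode(generated[0]))
```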


## GPTSw3Config

[[autodoc]] GPTSw3Config

## GPTSw3Tokenizer

[[autodoc]] GPTSw3Tokenizer
- save_vocabulary
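
The tokenizer is backed by SentencePiece (per the commit history, its import is guarded by `is_sentencepiece_available`). A minimal round-trip sketch, reusing the hypothetical checkpoint name from above:

```python
from transformers import GPTSw3Tokenizer

tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden/gpt-sw3-356m")  # hypothetical name

ids = tokenizer("Svenska är kul!")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))  # inspect the SentencePiece pieces
print(tokenizer.decode(ids))                 # decode back to text
```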

## GPTSw3 specific outputs

[[autodoc]] models.gpt_sw3.modeling_gpt_sw3.GPTSw3DoubleHeadsModelOutput

## GPTSw3Model

[[autodoc]] GPTSw3Model
- forward
- parallelize
- deparallelize

## GPTSw3LMHeadModel

[[autodoc]] GPTSw3LMHeadModel
- forward
- parallelize
- deparallelize

## GPTSw3DoubleHeadsModel

[[autodoc]] GPTSw3DoubleHeadsModel
- forward

## GPTSw3ForSequenceClassification

[[autodoc]] GPTSw3ForSequenceClassification
- forward

## GPTSw3ForTokenClassification

[[autodoc]] GPTSw3ForTokenClassification
- forward
src/transformers/__init__.py (22 additions, 0 deletions)

@@ -251,6 +251,7 @@
"models.gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig"],
"models.gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"],
"models.gpt_neox_japanese": ["GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXJapaneseConfig"],
"models.gpt_sw3": ["GPT_SW3_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTSw3Config", "GPTSw3Tokenizer"],
"models.gptj": ["GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTJConfig"],
"models.groupvit": [
"GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -1441,6 +1442,17 @@
"GPTNeoXJapanesePreTrainedModel",
]
)
_import_structure["models.gpt_sw3"].extend(
[
"GPT_SW3_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTSw3DoubleHeadsModel",
"GPTSw3ForSequenceClassification",
"GPTSw3ForTokenClassification",
"GPTSw3LMHeadModel",
"GPTSw3Model",
"GPTSw3PreTrainedModel",
]
)
_import_structure["models.gptj"].extend(
[
"GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -3470,6 +3482,7 @@
    from .models.gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig
    from .models.gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig
    from .models.gpt_neox_japanese import GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXJapaneseConfig
    from .models.gpt_sw3 import GPT_SW3_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTSw3Config, GPTSw3Tokenizer
    from .models.gptj import GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTJConfig
    from .models.groupvit import (
        GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4478,6 +4491,15 @@
        GPTNeoXJapaneseModel,
        GPTNeoXJapanesePreTrainedModel,
    )
    from .models.gpt_sw3 import (
        GPT_SW3_PRETRAINED_MODEL_ARCHIVE_LIST,
        GPTSw3DoubleHeadsModel,
        GPTSw3ForSequenceClassification,
        GPTSw3ForTokenClassification,
        GPTSw3LMHeadModel,
        GPTSw3Model,
        GPTSw3PreTrainedModel,
    )
    from .models.gptj import (
        GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST,
        GPTJForCausalLM,
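
For orientation: the `_import_structure` entries above feed transformers' lazy `_LazyModule`, which is what makes the new symbols importable from the top-level package. A minimal sketch of the resulting user-facing imports, assuming a transformers build containing this PR with `sentencepiece` installed:

```python
from transformers import GPT_SW3_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTSw3Config, GPTSw3Tokenizer

config = GPTSw3Config()  # defaults were aligned with the gpt-sw3 training config in this PR
print(config.model_type)
```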