From 0fd37f5dfffb5acee921ada9f60d69228a672613 Mon Sep 17 00:00:00 2001
From: lucidrains
Date: Fri, 20 Dec 2024 08:27:26 -0800
Subject: [PATCH] default learned value residual mix to true

---
 setup.py                         | 2 +-
 x_transformers/x_transformers.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 64fb6201..0adb3c27 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'x-transformers',
   packages = find_packages(exclude=['examples']),
-  version = '1.42.26',
+  version = '1.42.28',
   license='MIT',
   description = 'X-Transformers - Pytorch',
   author = 'Phil Wang',
diff --git a/x_transformers/x_transformers.py b/x_transformers/x_transformers.py
index c2acaf24..4a4e865d 100644
--- a/x_transformers/x_transformers.py
+++ b/x_transformers/x_transformers.py
@@ -1584,7 +1584,7 @@ def __init__(
         unet_skips = False,
         reinject_input = False, # seen first in DEQ paper https://arxiv.org/abs/1909.01377, but later used in a number of papers trying to achieve depthwise generalization https://arxiv.org/abs/2410.03020v1
         add_value_residual = False, # resformer from Zhou et al - https://arxiv.org/abs/2410.17897v1
-        learned_value_residual_mix = False, # seeing big improvements when the value residual mix value is learned per token - credit goes to @faresobeid for taking the first step with learned scalar mix, then @Blinkdl for taking it a step further with data dependent. here we will use per token learned
+        learned_value_residual_mix = True, # seeing big improvements when the value residual mix value is learned per token - credit goes to @faresobeid for taking the first step with learned scalar mix, then @Blinkdl for taking it a step further with data dependent. here we will use per token learned
         rel_pos_kwargs: dict = dict(),
         **kwargs
     ):