diff --git a/t5/data/tasks.py b/t5/data/tasks.py
index e6be58f9..b79f38d6 100644
--- a/t5/data/tasks.py
+++ b/t5/data/tasks.py
@@ -34,6 +34,7 @@
 }
 
 # ==================================== C4 ======================================
+# Configurable tasks used for comparisons in Raffel et al., 2019.
 _c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
 for config_suffix in _c4_config_suffixes:
   TaskRegistry.add(
@@ -47,6 +48,52 @@
       output_features=DEFAULT_OUTPUT_FEATURES,
       metric_fns=[])
 
+# Final pretraining task used in Raffel et al., 2019.
+# NOTE(review): noise_density (0.15) and mean_noise_span_length (3.0) are passed
+# to both random_spans_helper and denoise below; presumably they must agree.
+TaskRegistry.add(
+    "c4_v220_span_corruption",
+    TfdsTask,
+    tfds_name="c4/en:2.2.0",
+    text_preprocessor=functools.partial(
+        preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
+    token_preprocessor=[
+        functools.partial(
+            preprocessors.select_random_chunk,
+            feature_key="targets",
+            max_length=65536
+        ),
+        functools.partial(
+            preprocessors.reduce_concat_tokens,
+            feature_key="targets",
+            batch_size=128
+        ),
+        functools.partial(
+            preprocessors.split_tokens,
+            feature_key="targets",
+            min_tokens_per_segment=None,
+            max_tokens_per_segment=preprocessors.random_spans_helper(
+                extra_tokens_per_span_inputs=1,
+                extra_tokens_per_span_targets=1,
+                inputs_length=512,
+                mean_noise_span_length=3.0,
+                noise_density=0.15
+            )[0]
+        ),
+        functools.partial(
+            preprocessors.denoise,
+            inputs_fn=preprocessors.noise_span_to_unique_sentinel,
+            targets_fn=preprocessors.noise_span_to_unique_sentinel,
+            noise_density=0.15,
+            noise_function=functools.partial(
+                preprocessors.random_spans_noise_mask,
+                mean_noise_span_length=3.0
+            )
+        )
+    ],
+    output_features=DEFAULT_OUTPUT_FEATURES,
+    metric_fns=[])
+
 # ================================ Wikipedia ===================================
 TaskRegistry.add(
     "wikipedia_20190301.en_v003_unsupervised",