Skip to content

Commit

Permalink
Add hard-coded version of final pretraining task for use without gin …
Browse files Browse the repository at this point in the history
…configs. Fixes #269.

PiperOrigin-RevId: 317701286
  • Loading branch information
adarob authored and t5-copybara committed Jun 22, 2020
1 parent ec1aba1 commit 897ea73
Showing 1 changed file with 75 additions and 0 deletions.
75 changes: 75 additions & 0 deletions t5/data/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
}

# ==================================== C4 ======================================
# Configurable tasks used for comparisons in Raffel et al., 2019.
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
for config_suffix in _c4_config_suffixes:
TaskRegistry.add(
Expand All @@ -47,6 +48,80 @@
output_features=DEFAULT_OUTPUT_FEATURES,
metric_fns=[])

# Final pretraining task used in Raffel et al., 2019.
# Registered with every hyperparameter spelled out inline so the task can be
# used without gin configs.
TaskRegistry.add(
    "c4_v220_span_corruption",
    TfdsTask,
    # Fixed dataset name and version. This is a plain literal on purpose: the
    # name has no format placeholders, so it must not reference the
    # `config_suffix` variable of the configurable C4 tasks.
    tfds_name="c4/en:2.2.0",
    # Map the raw "text" feature to "targets"; "inputs" is mapped to None.
    text_preprocessor=functools.partial(
        preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
    # Token preprocessors, listed in the order they are passed to the task.
    token_preprocessor=[
        functools.partial(
            preprocessors.select_random_chunk,
            feature_key="targets",
            max_length=65536
        ),
        functools.partial(
            preprocessors.reduce_concat_tokens,
            feature_key="targets",
            batch_size=128
        ),
        functools.partial(
            preprocessors.split_tokens,
            feature_key="targets",
            min_tokens_per_segment=None,
            # Segment length is the first element returned by
            # `random_spans_helper`, evaluated once at registration time.
            # NOTE(review): presumably this is the raw token count that
            # yields 512-token inputs after span corruption -- confirm
            # against `preprocessors.random_spans_helper`.
            max_tokens_per_segment=preprocessors.random_spans_helper(
                extra_tokens_per_span_inputs=1,
                extra_tokens_per_span_targets=1,
                inputs_length=512,
                mean_noise_span_length=3.0,
                noise_density=0.15
            )[0]
        ),
        # The noise settings here (density 0.15, mean span length 3.0) repeat
        # the values passed to `random_spans_helper` above; keep them in sync.
        functools.partial(
            preprocessors.denoise,
            inputs_fn=preprocessors.noise_span_to_unique_sentinel,
            targets_fn=preprocessors.nonnoise_span_to_unique_sentinel,
            noise_density=0.15,
            noise_function=functools.partial(
                preprocessors.random_spans_noise_mask,
                mean_noise_span_length=3.0
            )
        )
    ],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[])

# Baseline pretraining task used in Raffel et al., 2019.
# Registered with every hyperparameter spelled out inline so the task can be
# used without gin configs.
TaskRegistry.add(
    "c4_v220_iid_denoising",
    TfdsTask,
    # Fixed dataset name and version. This is a plain literal on purpose: the
    # name has no format placeholders, so it must not reference the
    # `config_suffix` variable of the configurable C4 tasks.
    tfds_name="c4/en:2.2.0",
    # Map the raw "text" feature to "targets"; "inputs" is mapped to None.
    text_preprocessor=functools.partial(
        preprocessors.rekey, key_map={"inputs": None, "targets": "text"}),
    # Token preprocessors, listed in the order they are passed to the task.
    token_preprocessor=[
        functools.partial(
            preprocessors.select_random_chunk,
            feature_key="targets",
            max_length=65536
        ),
        functools.partial(
            preprocessors.reduce_concat_tokens,
            feature_key="targets",
            batch_size=128
        ),
        # Passed without bound arguments, so its own defaults apply.
        preprocessors.split_tokens_to_inputs_length,
        # Denoising with noise density 0.15 and `iid_noise_mask` as the
        # noise function.
        functools.partial(
            preprocessors.denoise,
            inputs_fn=preprocessors.noise_span_to_unique_sentinel,
            targets_fn=preprocessors.nonnoise_span_to_unique_sentinel,
            noise_density=0.15,
            noise_function=preprocessors.iid_noise_mask
        )
    ],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[])

# ================================ Wikipedia ===================================
TaskRegistry.add(
"wikipedia_20190301.en_v003_unsupervised",
Expand Down

0 comments on commit 897ea73

Please sign in to comment.