Commit
Add huggingface dataset example
satyaog committed May 1, 2023
1 parent 559c7e0 commit b39e455
Showing 11 changed files with 1,010 additions and 0 deletions.
452 changes: 452 additions & 0 deletions docs/examples/data/hf/README.rst

Large diffs are not rendered by default.

52 changes: 52 additions & 0 deletions docs/examples/data/hf/_index.rst
@@ -0,0 +1,52 @@
Hugging Face Dataset
====================


**Prerequisites**

Make sure to read the following sections of the documentation before using this
example:

* :ref:`pytorch_setup`
* :ref:`001 - Single GPU Job`

The full source code for this example is available on `the mila-docs GitHub
repository.
<https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/hf>`_


**job.sh**

.. literalinclude:: examples/data/hf/job.sh.diff
:language: diff


**main.py**

.. literalinclude:: examples/data/hf/main.py.diff
:language: diff


**prepare_data.py**

.. literalinclude:: examples/data/hf/prepare_data.py
:language: python


**data.sh**

.. literalinclude:: examples/data/hf/data.sh
:language: bash


**get_dataset_cache_files.py**

.. literalinclude:: examples/data/hf/get_dataset_cache_files.py
:language: python


**Running this example**

.. code-block:: bash

   $ sbatch job.sh
17 changes: 17 additions & 0 deletions docs/examples/data/hf/data.sh
@@ -0,0 +1,17 @@
#!/bin/bash
set -o errexit

_SRC=$1
_DEST=$2
_WORKERS=$3

# Clone the dataset structure (not the data itself) locally so HF can find the
# cache hashes it looks for. Otherwise HF might think it needs to redo some
# preprocessing. Directories will be created and the files will be replaced by
# symlinks pointing to the originals.
bash sh_utils.sh ln_files "${_SRC}" "${_DEST}" $_WORKERS

# Copy the preprocessed dataset to the compute node's local dataset cache dir
# so it is close to the GPUs for faster training. Since HF can very easily
# change the hash it uses to reference a preprocessed dataset, only the data
# for the current preprocessing pipeline is copied.
python3 get_dataset_cache_files.py | bash sh_utils.sh cp_files "${_SRC}" "${_DEST}" $_WORKERS
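
Note: the ln_files and cp_files helpers used above are provided by sh_utils.sh, which
is not rendered in this view. As a rough illustration of the idea only, a hypothetical
single-process Python equivalent of ln_files could look like this (an assumption, not
the actual helper):

"""Hypothetical single-process sketch of the ln_files idea: mirror SRC's
directory tree under DEST and replace every file by a symlink to the original,
so the HuggingFace cache layout is reproduced without copying any data."""
import sys
from pathlib import Path


def ln_files(src: str, dest: str) -> None:
    src_dir, dest_dir = Path(src), Path(dest)
    for path in src_dir.rglob("*"):
        target = dest_dir / path.relative_to(src_dir)
        if path.is_dir():
            target.mkdir(parents=True, exist_ok=True)
        else:
            target.parent.mkdir(parents=True, exist_ok=True)
            if not target.exists():
                target.symlink_to(path.resolve())


if __name__ == "__main__":
    ln_files(sys.argv[1], sys.argv[2])
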
35 changes: 35 additions & 0 deletions docs/examples/data/hf/get_dataset_cache_files.py
@@ -0,0 +1,35 @@
"""List to stdout the files of the dataset"""

from pathlib import Path
import sys

import datasets

from py_utils import (
get_dataset_builder, get_num_workers, get_raw_datasets, get_tokenizer,
preprocess_datasets
)


if __name__ == "__main__":
# Redirect outputs to stderr to avoid noise in stdout
_stdout = sys.stdout
sys.stdout = sys.stderr

try:
_CACHE_DIR = sys.argv[1]
except IndexError:
_CACHE_DIR = datasets.config.HF_DATASETS_CACHE
try:
_WORKERS = int(sys.argv[2])
except IndexError:
_WORKERS = get_num_workers()

cache_dir = Path(_CACHE_DIR)
builder = get_dataset_builder(cache_dir=_CACHE_DIR)
raw_datasets = get_raw_datasets(builder)
tokenizer = get_tokenizer()
for dataset in preprocess_datasets(tokenizer, raw_datasets, num_workers=_WORKERS).values():
for cache_file in dataset.cache_files:
cache_file = Path(cache_file["filename"]).relative_to(cache_dir)
print(cache_file, file=_stdout)
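
Note: get_dataset_builder, get_num_workers, get_raw_datasets, get_tokenizer and
preprocess_datasets are imported from py_utils, which is not shown in this view.
As one hedged example, get_num_workers presumably respects the CPU allocation
given by SLURM, roughly along these lines (an assumption, not the actual
implementation):

"""Hypothetical sketch of get_num_workers from py_utils."""
import os


def get_num_workers() -> int:
    # Prefer the CPU count SLURM allocated to this task, if available.
    slurm_cpus = os.environ.get("SLURM_CPUS_PER_TASK")
    if slurm_cpus is not None:
        return int(slurm_cpus)
    # Fall back to the CPUs this process is actually allowed to run on.
    return len(os.sched_getaffinity(0))
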
102 changes: 102 additions & 0 deletions docs/examples/data/hf/job.sh
@@ -0,0 +1,102 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=12
#SBATCH --ntasks-per-node=1
#SBATCH --mem=48G
#SBATCH --time=04:00:00
#SBATCH --tmp=1500G
set -o errexit

function wrap_cmd {
for a in "$@"
do
echo -n "\"$a\" "
done
}


# Echo time and hostname into log
echo "Date: $(date)"
echo "Hostname: $(hostname)"


# Ensure only anaconda/3 module loaded.
module --quiet purge
# This example uses Conda to manage package dependencies.
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3
module load cuda/11.7


# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 scipy -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm datasets

# Activate pre-existing environment.
conda activate pytorch


if [[ -z "$HF_DATASETS_CACHE" ]]
then
# Store the huggingface datasets cache in $SCRATCH
export HF_DATASETS_CACHE=$SCRATCH/cache/huggingface/datasets
fi
if [[ -z "$HUGGINGFACE_HUB_CACHE" ]]
then
# Store the huggingface hub cache in $SCRATCH
export HUGGINGFACE_HUB_CACHE=$SCRATCH/cache/huggingface/hub
fi
if [[ -z "$_DATA_PREP_WORKERS" ]]
then
_DATA_PREP_WORKERS=$SLURM_JOB_CPUS_PER_NODE
fi
if [[ -z "$_DATA_PREP_WORKERS" ]]
then
_DATA_PREP_WORKERS=16
fi


# Preprocess the dataset and cache the result such that the heavy work is done
# only once *ever*
# Required conda packages:
# conda install -y -c conda-forge zstandard
srun --ntasks=1 --ntasks-per-node=1 \
time -p python3 prepare_data.py "/network/datasets/pile" $_DATA_PREP_WORKERS


# Copy the preprocessed dataset to $SLURM_TMPDIR so it is close to the GPUs for
# faster training. This should be done once per compute node
cmd=(
# Having 'bash' here allows the execution of a script file which might not
# have the execution flag on
bash data.sh
# The current dataset cache dir
"$HF_DATASETS_CACHE"
# The local dataset cache dir
    # Use single quotes ('') to delay the expansion so that $SLURM_TMPDIR gets
    # interpreted on the local compute node rather than on the master node
'$SLURM_TMPDIR/data'
$_DATA_PREP_WORKERS
)
# 'time' gives an objective measure of the dataset copy. This can be used to
# compare the timing of multiple code versions and make sure any slowdown
# doesn't come from the code itself.
srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
time -p bash -c "$(wrap_cmd "${cmd[@]}")"


# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
unset CUDA_VISIBLE_DEVICES

# Execute Python script
env_var=(
# Use the local copy of the preprocessed dataset
HF_DATASETS_CACHE='"$SLURM_TMPDIR/data"'
)
cmd=(
python3
main.py
)
srun bash -c "$(echo "${env_var[@]}") $(wrap_cmd "${cmd[@]}")"
114 changes: 114 additions & 0 deletions docs/examples/data/hf/main.py
@@ -0,0 +1,114 @@
"""HuggingFace training example."""
import logging

import rich.logging
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from py_utils import (
get_dataset_builder, get_num_workers, get_raw_datasets, get_tokenizer,
preprocess_datasets
)


def main():
training_epochs = 1
batch_size = 256

# Check that the GPU is available
assert torch.cuda.is_available() and torch.cuda.device_count() > 0
device = torch.device("cuda", 0)

# Setup logging (optional, but much better than using print statements)
logging.basicConfig(
level=logging.INFO,
handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package.
)

logger = logging.getLogger(__name__)

    # Set up the Hugging Face dataset
num_workers = get_num_workers()
train_dataset, valid_dataset, test_dataset = make_datasets(num_workers)
train_dataloader = DataLoader(
train_dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=True,
)
valid_dataloader = DataLoader(
valid_dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=False,
)
test_dataloader = DataLoader( # NOTE: Not used in this example.
test_dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=False,
)

    # Check out the "checkpointing and preemption" example for more info!
logger.debug("Starting training from scratch.")

for epoch in range(training_epochs):
logger.debug(f"Starting epoch {epoch}/{training_epochs}")

# NOTE: using a progress bar from tqdm because it's nicer than using `print`.
progress_bar = tqdm(
total=len(train_dataloader),
desc=f"Train epoch {epoch}",
)

# Training loop
for batch in train_dataloader:
# Move the batch to the GPU before we pass it to the model
batch = {k:item.to(device) for k, item in batch.items()}

# [Training of the model goes here]

            # Advance the progress bar one step (nicer than just printing)
progress_bar.update(1)
progress_bar.close()

val_loss, val_accuracy = validation_loop(None, valid_dataloader, device)
logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")

print("Done!")


@torch.no_grad()
def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
total_loss = 0.0
n_samples = 0
correct_predictions = 0

for batch in dataloader:
batch = {k:item.to(device) for k, item in batch.items()}

batch_n_samples = batch["input_ids"].data.shape[0]

n_samples += batch_n_samples
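        # [Evaluation of the model on the batch goes here, updating
        # total_loss and correct_predictions]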

accuracy = correct_predictions / n_samples
return total_loss, accuracy


def make_datasets(num_workers:int=None):
"""Returns the training, validation, and test splits for the prepared dataset.
"""
builder = get_dataset_builder()
raw_datasets = get_raw_datasets(builder)
tokenizer = get_tokenizer()
preprocessed_datasets = preprocess_datasets(tokenizer, raw_datasets, num_workers=num_workers)
return (
preprocessed_datasets["train"], preprocessed_datasets["validation"],
preprocessed_datasets["test"]
)


if __name__ == "__main__":
main()
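
Note: the training step is intentionally left out of main.py ("[Training of the
model goes here]"). A minimal sketch of what it could look like, assuming a
causal language model from the transformers library (the model name, optimizer
and loss below are illustrative assumptions, not part of this commit):

"""Hypothetical training step for the skeleton above, assuming a causal LM."""
import torch
from transformers import AutoModelForCausalLM


def train_step(model, optimizer, batch, device):
    # Move the tokenized batch to the GPU and compute the language-modelling
    # loss (labels are the input ids; the model shifts them internally).
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(
        input_ids=batch["input_ids"],
        attention_mask=batch.get("attention_mask"),
        labels=batch["input_ids"],
    )
    loss = outputs.loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.detach()


# Illustrative setup: a small GPT-2 model and a plain AdamW optimizer.
# model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
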
30 changes: 30 additions & 0 deletions docs/examples/data/hf/prepare_data.py
@@ -0,0 +1,30 @@
"""Preprocess the dataset.
In this example, HuggingFace is used and the resulting dataset will be stored in
$HF_DATASETS_CACHE. It is preferable to set the datasets cache to a location in
$SCRATCH"""

from py_utils import (
get_config, get_dataset_builder, get_num_workers, get_raw_datasets,
get_tokenizer, preprocess_datasets
)


if __name__ == "__main__":
import sys
import time

_LOCAL_DS = sys.argv[1]
try:
_WORKERS = int(sys.argv[2])
except IndexError:
_WORKERS = get_num_workers()

t = -time.time()
_ = get_config()
builder = get_dataset_builder(local_dataset=_LOCAL_DS, num_workers=_WORKERS)
raw_datasets = get_raw_datasets(builder)
tokenizer = get_tokenizer()
_ = preprocess_datasets(tokenizer, raw_datasets, num_workers=_WORKERS)
t += time.time()

print(f"Prepared data in {t/60:.2f}m")