Commit
Showing 11 changed files with 1,010 additions and 0 deletions.
Large diffs are not rendered by default.
@@ -0,0 +1,52 @@
Hugging Face Dataset
====================


**Prerequisites**

Make sure to read the following sections of the documentation before using this
example:

* :ref:`pytorch_setup`
* :ref:`001 - Single GPU Job`

The full source code for this example is available on `the mila-docs GitHub
repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/hf>`_


**job.sh**

.. literalinclude:: examples/data/hf/job.sh.diff
   :language: diff


**main.py**

.. literalinclude:: examples/data/hf/main.py.diff
   :language: diff


**prepare_data.py**

.. literalinclude:: examples/data/hf/prepare_data.py
   :language: python


**data.sh**

.. literalinclude:: examples/data/hf/data.sh
   :language: bash


**get_dataset_cache_files.py**

.. literalinclude:: examples/data/hf/get_dataset_cache_files.py
   :language: python


**Running this example**

.. code-block:: bash

   $ sbatch job.sh
data.sh

@@ -0,0 +1,17 @@
#!/bin/bash
set -o errexit

_SRC=$1
_DEST=$2
_WORKERS=$3

# Clone the dataset structure (not the data itself) locally so HF can find the
# cache hashes it looks for. Otherwise HF might think it needs to redo some
# preprocessing. Directories will be created and symlinks will replace the files.
bash sh_utils.sh ln_files "${_SRC}" "${_DEST}" $_WORKERS

# Copy the preprocessed dataset to the compute node's local dataset cache dir so
# it is close to the GPUs for faster training. Since HF can very easily change
# the hash used to reference a preprocessed dataset, we only copy the data for
# the current preprocessing pipeline.
python3 get_dataset_cache_files.py | bash sh_utils.sh cp_files "${_SRC}" "${_DEST}" $_WORKERS
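sh_utils.sh is part of this commit but is not rendered above. Purely as an illustration of the behaviour described in the comments (mirror the source tree with symlinks, then copy only the files listed on stdin), a hypothetical sketch could look like the following; the repository's real script may differ.

#!/bin/bash
# Hypothetical sketch of sh_utils.sh, inferred from how data.sh calls it.
set -o errexit

ln_files() {
    # Recreate SRC's directory tree under DEST and symlink every file so that
    # HF finds the cache structure locally without copying any data.
    local src=$1 dest=$2 workers=${3:-4}
    (cd "$src" && find . -type d -print0) | xargs -0 -I{} mkdir -p "$dest/{}"
    (cd "$src" && find . -type f -print0) | \
        xargs -0 -P "$workers" -I{} ln -sf "$src/{}" "$dest/{}"
}

cp_files() {
    # Read relative file paths on stdin and copy only those files from SRC to
    # DEST, replacing the symlinks created by ln_files.
    local src=$1 dest=$2 workers=${3:-4}
    xargs -P "$workers" -I{} cp --remove-destination "$src/{}" "$dest/{}"
}

"$@"  # dispatch, e.g. `bash sh_utils.sh ln_files SRC DEST WORKERS`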
get_dataset_cache_files.py

@@ -0,0 +1,35 @@
"""List to stdout the files of the dataset""" | ||
|
||
from pathlib import Path | ||
import sys | ||
|
||
import datasets | ||
|
||
from py_utils import ( | ||
get_dataset_builder, get_num_workers, get_raw_datasets, get_tokenizer, | ||
preprocess_datasets | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
# Redirect outputs to stderr to avoid noize in stdout | ||
_stdout = sys.stdout | ||
sys.stdout = sys.stderr | ||
|
||
try: | ||
_CACHE_DIR = sys.argv[1] | ||
except IndexError: | ||
_CACHE_DIR = datasets.config.HF_DATASETS_CACHE | ||
try: | ||
_WORKERS = int(sys.argv[2]) | ||
except IndexError: | ||
_WORKERS = get_num_workers() | ||
|
||
cache_dir = Path(_CACHE_DIR) | ||
builder = get_dataset_builder(cache_dir=_CACHE_DIR) | ||
raw_datasets = get_raw_datasets(builder) | ||
tokenizer = get_tokenizer() | ||
for dataset in preprocess_datasets(tokenizer, raw_datasets, num_workers=_WORKERS).values(): | ||
for cache_file in dataset.cache_files: | ||
cache_file = Path(cache_file["filename"]).relative_to(cache_dir) | ||
print(cache_file, file=_stdout) |
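py_utils.py is another file in this commit that is not rendered above. As a rough illustration only, here is what a few of its helpers could look like, assuming the Hugging Face `datasets` library and a `transformers` tokenizer; the tokenizer choice, the "text" column, and every name below are assumptions, and the repository's actual implementation may differ.

# Hypothetical sketch of some py_utils helpers, inferred from how they are
# called in this example. Not the repository's actual implementation.
import os

from transformers import AutoTokenizer  # assumption: a transformers tokenizer


def get_num_workers() -> int:
    # Assumption: use the CPUs actually allocated to this job.
    return len(os.sched_getaffinity(0))


def get_tokenizer():
    # Assumption: a GPT-2 tokenizer; the real example may use another one.
    return AutoTokenizer.from_pretrained("gpt2")


def preprocess_datasets(tokenizer, raw_datasets, num_workers=None):
    # `DatasetDict.map(..., num_proc=...)` writes its results to the datasets
    # cache; those cache files are what get_dataset_cache_files.py lists.
    return raw_datasets.map(
        lambda batch: tokenizer(batch["text"]),
        batched=True,
        num_proc=num_workers,
    )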
job.sh

@@ -0,0 +1,102 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=12
#SBATCH --ntasks-per-node=1
#SBATCH --mem=48G
#SBATCH --time=04:00:00
#SBATCH --tmp=1500G
set -o errexit

function wrap_cmd {
    for a in "$@"
    do
        echo -n "\"$a\" "
    done
}


# Echo time and hostname into log
echo "Date: $(date)"
echo "Hostname: $(hostname)"


# Ensure only anaconda/3 module loaded.
module --quiet purge
# This example uses Conda to manage package dependencies.
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3
module load cuda/11.7


# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
#     pytorch-cuda=11.7 scipy -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm datasets

# Activate pre-existing environment.
conda activate pytorch


if [[ -z "$HF_DATASETS_CACHE" ]]
then
    # Store the huggingface datasets cache in $SCRATCH
    export HF_DATASETS_CACHE=$SCRATCH/cache/huggingface/datasets
fi
if [[ -z "$HUGGINGFACE_HUB_CACHE" ]]
then
    # Store the huggingface hub cache in $SCRATCH
    export HUGGINGFACE_HUB_CACHE=$SCRATCH/cache/huggingface/hub
fi
if [[ -z "$_DATA_PREP_WORKERS" ]]
then
    _DATA_PREP_WORKERS=$SLURM_JOB_CPUS_PER_NODE
fi
if [[ -z "$_DATA_PREP_WORKERS" ]]
then
    _DATA_PREP_WORKERS=16
fi


# Preprocess the dataset and cache the result such that the heavy work is done
# only once *ever*.
# Required conda packages:
# conda install -y -c conda-forge zstandard
srun --ntasks=1 --ntasks-per-node=1 \
    time -p python3 prepare_data.py "/network/datasets/pile" $_DATA_PREP_WORKERS


# Copy the preprocessed dataset to $SLURM_TMPDIR so it is close to the GPUs for
# faster training. This should be done once per compute node.
cmd=(
    # Having 'bash' here allows the execution of a script file which might not
    # have the execution flag on
    bash data.sh
    # The current dataset cache dir
    "$HF_DATASETS_CACHE"
    # The local dataset cache dir. Use single quotes to delay expansion so that
    # $SLURM_TMPDIR is interpreted on the local compute node rather than on the
    # master node.
    '$SLURM_TMPDIR/data'
    $_DATA_PREP_WORKERS
)
# 'time' gives an objective measure of the dataset copy. It can be used to
# compare the timing across multiple code versions and make sure any slowdown
# doesn't come from the code itself.
srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
    time -p bash -c "$(wrap_cmd "${cmd[@]}")"


# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
unset CUDA_VISIBLE_DEVICES

# Execute Python script
env_var=(
    # Use the local copy of the preprocessed dataset
    HF_DATASETS_CACHE='"$SLURM_TMPDIR/data"'
)
cmd=(
    python3
    main.py
)
srun bash -c "$(echo "${env_var[@]}") $(wrap_cmd "${cmd[@]}")"
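job.sh only falls back to its defaults when HF_DATASETS_CACHE, HUGGINGFACE_HUB_CACHE and _DATA_PREP_WORKERS are unset, so they can be overridden at submission time. A minimal sketch, assuming the default `sbatch --export=ALL` behaviour (the values below are illustrative, not recommendations):

# Illustrative overrides of the variables job.sh checks before using defaults.
export HF_DATASETS_CACHE=$SCRATCH/cache/huggingface/datasets
export HUGGINGFACE_HUB_CACHE=$SCRATCH/cache/huggingface/hub
export _DATA_PREP_WORKERS=16
sbatch job.sh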
main.py

@@ -0,0 +1,114 @@
"""HuggingFace training example.""" | ||
import logging | ||
|
||
import rich.logging | ||
import torch | ||
from torch import nn | ||
from torch.utils.data import DataLoader | ||
from tqdm import tqdm | ||
|
||
from py_utils import ( | ||
get_dataset_builder, get_num_workers, get_raw_datasets, get_tokenizer, | ||
preprocess_datasets | ||
) | ||
|
||
|
||
def main(): | ||
training_epochs = 1 | ||
batch_size = 256 | ||
|
||
# Check that the GPU is available | ||
assert torch.cuda.is_available() and torch.cuda.device_count() > 0 | ||
device = torch.device("cuda", 0) | ||
|
||
# Setup logging (optional, but much better than using print statements) | ||
logging.basicConfig( | ||
level=logging.INFO, | ||
handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. | ||
) | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# Setup ImageNet | ||
num_workers = get_num_workers() | ||
train_dataset, valid_dataset, test_dataset = make_datasets(num_workers) | ||
train_dataloader = DataLoader( | ||
train_dataset, | ||
batch_size=batch_size, | ||
num_workers=num_workers, | ||
shuffle=True, | ||
) | ||
valid_dataloader = DataLoader( | ||
valid_dataset, | ||
batch_size=batch_size, | ||
num_workers=num_workers, | ||
shuffle=False, | ||
) | ||
test_dataloader = DataLoader( # NOTE: Not used in this example. | ||
test_dataset, | ||
batch_size=batch_size, | ||
num_workers=num_workers, | ||
shuffle=False, | ||
) | ||
|
||
# Checkout the "checkpointing and preemption" example for more info! | ||
logger.debug("Starting training from scratch.") | ||
|
||
for epoch in range(training_epochs): | ||
logger.debug(f"Starting epoch {epoch}/{training_epochs}") | ||
|
||
# NOTE: using a progress bar from tqdm because it's nicer than using `print`. | ||
progress_bar = tqdm( | ||
total=len(train_dataloader), | ||
desc=f"Train epoch {epoch}", | ||
) | ||
|
||
# Training loop | ||
for batch in train_dataloader: | ||
# Move the batch to the GPU before we pass it to the model | ||
batch = {k:item.to(device) for k, item in batch.items()} | ||
|
||
# [Training of the model goes here] | ||
|
||
# Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) | ||
progress_bar.update(1) | ||
progress_bar.close() | ||
|
||
val_loss, val_accuracy = validation_loop(None, valid_dataloader, device) | ||
logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") | ||
|
||
print("Done!") | ||
|
||
|
||
@torch.no_grad() | ||
def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): | ||
total_loss = 0.0 | ||
n_samples = 0 | ||
correct_predictions = 0 | ||
|
||
for batch in dataloader: | ||
batch = {k:item.to(device) for k, item in batch.items()} | ||
|
||
batch_n_samples = batch["input_ids"].data.shape[0] | ||
|
||
n_samples += batch_n_samples | ||
|
||
accuracy = correct_predictions / n_samples | ||
return total_loss, accuracy | ||
|
||
|
||
def make_datasets(num_workers:int=None): | ||
"""Returns the training, validation, and test splits for the prepared dataset. | ||
""" | ||
builder = get_dataset_builder() | ||
raw_datasets = get_raw_datasets(builder) | ||
tokenizer = get_tokenizer() | ||
preprocessed_datasets = preprocess_datasets(tokenizer, raw_datasets, num_workers=num_workers) | ||
return ( | ||
preprocessed_datasets["train"], preprocessed_datasets["validation"], | ||
preprocessed_datasets["test"] | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
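main.py treats each batch as a dictionary of tensors (it calls `.to(device)` on every value and reads `batch["input_ids"]`). Whether that works out of the box depends on how py_utils prepares the splits; if they are plain `datasets.Dataset` objects, one common approach (an assumption for illustration, not necessarily what py_utils does) is to set the output format to PyTorch tensors before wrapping them in a DataLoader:

# Assumption for illustration: make a HF Dataset yield PyTorch tensors so the
# DataLoader batches are dicts of tensors, as main.py expects.
train_dataset = train_dataset.with_format(
    "torch", columns=["input_ids", "attention_mask"]
)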
prepare_data.py

@@ -0,0 +1,30 @@
"""Preprocess the dataset. | ||
In this example, HuggingFace is used and the resulting dataset will be stored in | ||
$HF_DATASETS_CACHE. It is preferable to set the datasets cache to a location in | ||
$SCRATCH""" | ||
|
||
from py_utils import ( | ||
get_config, get_dataset_builder, get_num_workers, get_raw_datasets, | ||
get_tokenizer, preprocess_datasets | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
import sys | ||
import time | ||
|
||
_LOCAL_DS = sys.argv[1] | ||
try: | ||
_WORKERS = int(sys.argv[2]) | ||
except IndexError: | ||
_WORKERS = get_num_workers() | ||
|
||
t = -time.time() | ||
_ = get_config() | ||
builder = get_dataset_builder(local_dataset=_LOCAL_DS, num_workers=_WORKERS) | ||
raw_datasets = get_raw_datasets(builder) | ||
tokenizer = get_tokenizer() | ||
_ = preprocess_datasets(tokenizer, raw_datasets, num_workers=_WORKERS) | ||
t += time.time() | ||
|
||
print(f"Prepared data in {t/60:.2f}m") |