Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
ae1625b
Set default value of target_modules to be None in LoraConfig
willmj Jul 29, 2024
2b4ce8d
Removal of transformers logger and addition of python logger
Abhishek-TAMU Jul 30, 2024
76fffc5
FMT and lint check: Removal of transformers logger and addition of py…
Abhishek-TAMU Jul 30, 2024
773f0d1
fix: remove lm_head for granite with llama arch models (#258)
Ssukriti Jul 29, 2024
2e36147
Merge branch 'main' into fix_logging
Abhishek-TAMU Jul 30, 2024
a8bd8dd
Add config_utils tests
aluu317 Jul 24, 2024
7054070
Fix fmt
aluu317 Jul 24, 2024
906ce02
Separate tests out and use docstrings
aluu317 Jul 30, 2024
3e831c6
Update more field/value checks from HF defaults
aluu317 Jul 30, 2024
8846bc2
Fix: Addition of env var TRANSFORMERS_VERBOSITY check
Abhishek-TAMU Jul 30, 2024
4ed3878
FMT Fix: Addition of env var TRANSFORMERS_VERBOSITY check
Abhishek-TAMU Jul 30, 2024
dc32eda
Add test for tokenizer in lora config (should be ignored)
aluu317 Jul 30, 2024
efb8363
Adding logging support to accelerate launch
Abhishek-TAMU Jul 30, 2024
2ecfaf7
FMT_FIX: Adding logging support to accelerate launch
Abhishek-TAMU Jul 30, 2024
a9b8ec8
Merge pull request #262 from aluu317/test_config_utils
aluu317 Jul 30, 2024
f2e8afb
Merge branch 'main' into fix_logging
Abhishek-TAMU Jul 30, 2024
f7e7e23
bug: On save event added to callback (#256)
seshapad Jul 31, 2024
71c3e8a
feat: All metric handling changes (#263)
seshapad Jul 31, 2024
72471ae
feat: Configuration to set logging level for trigger log (#241)
seshapad Jul 31, 2024
0bf6871
Merge branch 'main' into 1143
willmj Jul 31, 2024
c44b1dc
Merge branch 'main' into fix_logging
Abhishek-TAMU Jul 31, 2024
f57ff63
limit peft deps until investigate (#274)
anhuong Jul 31, 2024
3439a68
Data custom collator (#260)
Ssukriti Jul 31, 2024
c7589d1
Revert "limit peft deps until investigate (#274)" (#275)
anhuong Aug 1, 2024
1884f61
feat: per process state metric (#239)
HarikrishnanBalagopal Aug 1, 2024
a09f837
Modify test to pass with target_modules: None
willmj Aug 1, 2024
c15f4f9
Merge branch 'main' into 1143
willmj Aug 1, 2024
abd8abc
Merge branch 'foundation-model-stack:main' into fix_logging
Abhishek-TAMU Aug 1, 2024
5d08efb
Logging changes and unit tests added
Abhishek-TAMU Aug 1, 2024
003feb5
feat: Add a dockerfile argument to enable aimstack (#261)
dushyantbehl Aug 1, 2024
a4cc9c2
Merge branch 'foundation-model-stack:main' into fix_logging
Abhishek-TAMU Aug 1, 2024
7fffda7
Solved conflict with main
Abhishek-TAMU Aug 1, 2024
ba8a972
FMT:Fix Solved conflict with main
Abhishek-TAMU Aug 1, 2024
f1159f9
enabling tests for prompt tuning
Abhishek-TAMU Aug 1, 2024
f757c9c
Merge branch 'main' into 1143
willmj Aug 1, 2024
59cc20b
Merge pull request #269 from willmj/1143
willmj Aug 1, 2024
612789d
feat: Support pretokenized (#272)
kmehant Aug 1, 2024
97cb42f
Update packaging requirement from <24,>=23.2 to >=23.2,<25 (#212)
dependabot[bot] Aug 1, 2024
756d097
Merge branch 'main' into main
anhuong Aug 1, 2024
d35a139
enabling tests for prompt tuning (#278)
Abhishek-TAMU Aug 2, 2024
e0da345
fix: do not add special tokens for custom tokenizer (#279)
kmehant Aug 5, 2024
bf30ed2
Merge remote-tracking branch 'upstream/main'
Abhishek-TAMU Aug 5, 2024
cb846ca
Merge branch 'main' of github.com:Abhishek-TAMU/fms-hf-tuning
Abhishek-TAMU Aug 5, 2024
da7acc6
merge with main
Abhishek-TAMU Aug 5, 2024
8af5792
PR changes for changing logger
Abhishek-TAMU Aug 5, 2024
5853949
fix: bug where the logger was not being used properly (#286)
HarikrishnanBalagopal Aug 5, 2024
697056c
Merge remote-tracking branch 'upstream/main'
Abhishek-TAMU Aug 5, 2024
768d93a
Merge branch 'main' into fix_logging
Abhishek-TAMU Aug 5, 2024
ba489b5
Unit Tests changes
Abhishek-TAMU Aug 5, 2024
d728007
Add functionality to free disk space from Github Actions (#287)
willmj Aug 5, 2024
dc9a521
commented os.environ[LOG_LEVEL] in accelerate.py for testing
Abhishek-TAMU Aug 6, 2024
f3a984a
Merge remote-tracking branch 'upstream/main'
Abhishek-TAMU Aug 6, 2024
024b12e
Merge branch 'main' into fix_logging
Abhishek-TAMU Aug 6, 2024
cfeb709
PR changes
Abhishek-TAMU Aug 6, 2024
bf36b36
FIX:FMT
Abhishek-TAMU Aug 6, 2024
fe4b6d5
PR Changes
Abhishek-TAMU Aug 6, 2024
c544f47
PR Changes
Abhishek-TAMU Aug 6, 2024
06614b6
Add unit test to verify target_modules defaults correctly (#281)
willmj Aug 6, 2024
d224be6
docs: Add documentation on experiment tracking. (#257)
dushyantbehl Aug 7, 2024
baeecf1
Ensure additional metadata to trackers don't throw error in happy cas…
dushyantbehl Aug 7, 2024
f0fcfdb
Merge remote-tracking branch 'upstream/main'
Abhishek-TAMU Aug 7, 2024
e65cb2d
Merge branch 'main' into fix_logging
Abhishek-TAMU Aug 7, 2024
4841119
PR Changes
Abhishek-TAMU Aug 7, 2024
0a9d4a6
fix multiple runid creation bug with accelerate. (#268)
dushyantbehl Aug 8, 2024
ce965a8
feat: logging control operation (#264)
seshapad Aug 8, 2024
fe8bb05
Merge remote-tracking branch 'upstream/main'
Abhishek-TAMU Aug 8, 2024
5ecf4dd
Metrics file epoch indexing from 0
Abhishek-TAMU Aug 8, 2024
89124ac
Revert last commit
Abhishek-TAMU Aug 8, 2024
bb0caf9
fix run evaluation to get base model path (#273)
anhuong Aug 8, 2024
bddad06
Merge remote-tracking branch 'upstream/main'
Abhishek-TAMU Aug 8, 2024
0777460
Merge branch 'main' into fix_logging
Abhishek-TAMU Aug 8, 2024
3068f51
PR Changes
Abhishek-TAMU Aug 8, 2024
0866bce
PR Changes
Abhishek-TAMU Aug 9, 2024
ee25de4
Merge pull request #270 from Abhishek-TAMU/fix_logging
Abhishek-TAMU Aug 9, 2024
a40e138
feat: Added additional events such as on_step_begin, on_optimizer_ste…
seshapad Aug 12, 2024
6e7cea6
Always update setuptools to latest (#288)
jbusche Aug 12, 2024
0aae2aa
Rename all fixtures with correct .jsonl extension (#295)
willmj Aug 12, 2024
78909af
feat: add save_model_dir flag where final checkpoint saved (#291)
anhuong Aug 14, 2024
6bcdf59
Merge tag 'v1.2.0-rc.1' into v1.2.0-rc1
willmj Aug 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .github/workflows/image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Free disk space
run: |
sudo swapoff -a
sudo rm -f /swapfile
sudo apt clean
docker rmi $(docker image ls -aq)
df -h
- name: Build image
run: |
docker build -t fms-hf-tuning:dev . -f build/Dockerfile
docker build -t fms-hf-tuning:dev . -f build/Dockerfile

100 changes: 91 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- [Training](#training)
- [Single GPU](#single-gpu)
- [Multiple GPUs with FSDP](#multiple-gpus-with-fsdp)
- [Tips on Parameters to Set](#tips-on-parameters-to-set)
- [Tuning Techniques](#tuning-techniques)
- [LoRA Tuning Example](#lora-tuning-example)
- [Prompt Tuning](#prompt-tuning)
Expand All @@ -18,6 +19,7 @@
- [Changing the Base Model for Inference](#changing-the-base-model-for-inference)
- [Validation](#validation)
- [Trainer Controller Framework](#trainer-controller-framework)
- [Experiment Tracking](#experiment-tracking)
- [More Examples](#more-examples)

This repo provides basic tuning scripts with support for specific models. The repo relies on Hugging Face `SFTTrainer` and PyTorch FSDP. Our approach to tuning is:
Expand All @@ -27,32 +29,43 @@ This repo provides basic tuning scripts with support for specific models. The re

## Installation

### Basic Installation

```
pip install fms-hf-tuning
```

### Using FlashAttention

> Note: After installing, if you wish to use [FlashAttention](https://github.com/Dao-AILab/flash-attention), then you need to install these requirements:
```
pip install fms-hf-tuning[dev]
pip install fms-hf-tuning[flash-attn]
```
[FlashAttention](https://github.com/Dao-AILab/flash-attention) requires the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) to be pre-installed.

If you wish to use [aim](https://github.com/aimhubio/aim), then you need to install it:
```
pip install fms-hf-tuning[aim]
```
### Using FMS-Acceleration

If you wish to use [fms-acceleration](https://github.com/foundation-model-stack/fms-acceleration), you need to install it.
```
pip install fms-hf-tuning[fms-accel]
```
`fms-acceleration` is a collection of plugins that packages that accelerate fine-tuning / training of large models, as part of the `fms-hf-tuning` suite. For more details on see [this section below](#fms-acceleration).
`fms-acceleration` is a collection of plugins that accelerate fine-tuning / training of large models, as part of the `fms-hf-tuning` suite. For more details see [this section below](#fms-acceleration).

### Using Experiment Trackers

To use experiment tracking with popular tools like [Aim](https://github.com/aimhubio/aim), note that some trackers are considered optional dependencies and can be installed with the following command:
```
pip install fms-hf-tuning[aim]
```
For more details on how to enable and use the trackers, please see [the experiment tracking section below](#experiment-tracking).

## Data format
We support two data formats:
We support the following data formats:

### 1. JSON formats with a single sequence and a specified response_template to use for masking on completion.

1. #### Pre-process the JSON/JSONL dataset
#### 1.1 Pre-process the JSON/JSONL dataset
Pre-process the JSON/JSONL dataset to contain a single sequence of each data instance containing input + Response. The trainer is configured to expect a response template as a string. For example, if one wants to prepare the `alpaca` format data to feed into this trainer, it is quite easy and can be done with the following code.

```python
Expand Down Expand Up @@ -87,7 +100,7 @@ The same way can be applied to any dataset, with more info can be found [here](h

Once the JSON is converted using the formatting function, pass the `dataset_text_field` containing the single sequence to the trainer.

2. #### Format JSON/JSONL on the fly
#### 1.2 Format JSON/JSONL on the fly
Pass a JSON/JSONL and a `data_formatter_template` to use the formatting function on the fly while tuning. The template should specify fields of JSON with `{{field}}`. While tuning, the data will be converted to a single sequence using the template.
JSON fields can contain alpha-numeric characters, spaces and the following special symbols - "." , "_", "-".

Expand All @@ -101,8 +114,20 @@ data_formatter_template: `### Input: {{input}} \n\n##Label: {{output}}`

Formatting will happen on the fly while tuning. The keys in the template should match fields in the JSON file. The `response template` corresponding to the above template will need to be supplied. In this case, `response template` = `\n## Label:`.

##### In conclusion, if using the response_template and single sequence, either the `data_formatter_template` argument or `dataset_text_field` needs to be supplied to the trainer.

##### In conclusion, either the `data_formatter_template` argument or `dataset_text_field` needs to be supplied to the trainer.
### 2. JSONL with input and output fields (no response template)

Pass a JSONL containing fields "input" with source text and "output" with class labels. Pre-format the input as you see fit. The output field will simply be concatenated to the end of the input to create a single sequence, and the input will be masked.

The "input" and "output" field names are mandatory and cannot be changed.

Example: Train.jsonl

```
{"input": "### Input: Colorado is a state in USA ### Output:", "output": "USA : Location"}
{"input": "### Input: Arizona is also a state in USA ### Output:", "output": "USA : Location"}
```

## Supported Models

Expand Down Expand Up @@ -201,6 +226,50 @@ tuning/sft_trainer.py \

To summarize you can pick either python for single-GPU jobs or use accelerate launch for multi-GPU jobs. The following tuning techniques can be applied:

### Tips on Parameters to Set

#### Saving checkpoints while training

By default, [`save_strategy`](tuning/config/configs.py) is set to `"epoch"` in the TrainingArguments. This means that checkpoints will be saved on each epoch. This can also be set to `"steps"` to save on every `"save_steps"` or `"no"` to not save any checkpoints.

Checkpoints are saved to the given `output_dir`, which is a required field. If `save_strategy="no"`, the `output_dir` will only contain the training logs with loss details.

A useful flag to set to limit the number of checkpoints saved is [`save_total_limit`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.save_total_limit). Older checkpoints are deleted from the `output_dir` to limit the number of checkpoints, for example, if `save_total_limit=1`, this will only save the last checkpoint. However, while tuning, two checkpoints will exist in `output_dir` for a short time as the new checkpoint is created and then the older one will be deleted. If the user sets a validation dataset and [`load_best_model_at_end`](https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments.load_best_model_at_end), then the best checkpoint will be saved.

#### Saving model after training

`save_model_dir` can optionally be set to save the tuned model using `SFTTrainer.save_model()`. This can be used in tandem with `save_strategy="no"` to only save the designated checkpoint and not any intermediate checkpoints, which can help to save space.

`save_model_dir` can be set to a different directory than `output_dir`. If set to the same directory, the designated checkpoint, training logs, and any intermediate checkpoints will all be saved to the same directory as seen below.

<details>
<summary>Ways you can use `save_model_dir` and more tips:</summary>

For example, if `save_model_dir` is set to a sub-directory of `output_dir` and `save_total_limit=1` with LoRA tuning, the directory would look like:

```sh
$ ls /tmp/output_dir/
checkpoint-35 save_model_dir training_logs.jsonl

$ ls /tmp/output_dir/save_model_dir/
README.md adapter_model.safetensors special_tokens_map.json tokenizer.model training_args.bin
adapter_config.json added_tokens.json tokenizer.json tokenizer_config.json
```

Here is a fine-tuning example of how the directory would look if `output_dir` is set to the same value as `save_model_dir` and `save_total_limit=2`. Note the checkpoint directories as well as the `training_logs.jsonl`:

```sh
$ ls /tmp/same_dir

added_tokens.json model-00001-of-00006.safetensors model-00006-of-00006.safetensors tokenizer_config.json
checkpoint-16 model-00002-of-00006.safetensors model.safetensors.index.json training_args.bin
checkpoint-20 model-00003-of-00006.safetensors special_tokens_map.json training_logs.jsonl
config.json model-00004-of-00006.safetensors tokenizer.json
generation_config.json model-00005-of-00006.safetensors tokenizer.model
```

</details>

## Tuning Techniques:

### LoRA Tuning Example
Expand Down Expand Up @@ -549,6 +618,19 @@ This framework helps users define rules to capture scenarios like criteria for s

For details about how you can use set a custom stopping criteria and perform custom operations, see [examples/trainercontroller_configs/Readme.md](examples/trainercontroller_configs/Readme.md)


## Experiment Tracking

Experiment tracking in fms-hf-tuning allows users to track their experiments with known trackers like [Aimstack](https://aimstack.io/) or custom trackers built into the code like
[FileLoggingTracker](./tuning/trackers/filelogging_tracker.py)

The code currently supports two trackers out of the box:
* `FileLoggingTracker` : A built in tracker which supports logging training loss to a file.
* `Aimstack` : A popular open-source tracker which can be used to track any metrics or metadata from the experiments.

Further details on enabling and using the trackers mentioned above can be found [here](docs/experiment-tracking.md).


## More Examples

[Prompt Tuning on Twitter Complaints](examples/prompt_tuning_twitter_complaints/README.md)
Expand Down
21 changes: 15 additions & 6 deletions build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ RUN dnf remove -y --disableplugin=subscription-manager \
&& ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \
&& python -m ensurepip --upgrade \
&& python -m pip install --upgrade pip \
&& python -m pip install --upgrade setuptools \
&& dnf update -y \
&& dnf clean all

Expand Down Expand Up @@ -104,6 +105,9 @@ ARG WHEEL_VERSION
ARG USER
ARG USER_UID

## Enable Aimstack if requested via ENABLE_AIM set to "true"
ARG ENABLE_AIM=false

RUN dnf install -y git && \
# perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
# Twistlock detects it as H severity: Private keys stored in image
Expand All @@ -129,6 +133,9 @@ RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
python -m pip install --user wheel && \
python -m pip install --user "$(head bdist_name)" && \
python -m pip install --user "$(head bdist_name)[flash-attn]" && \
if [[ "${ENABLE_AIM}" == "true" ]]; then \
python -m pip install --user "$(head bdist_name)[aim]"; \
fi && \
# Clean up the wheel module. It's only needed by flash-attn install
python -m pip uninstall wheel build -y && \
# Cleanup the bdist whl file
Expand All @@ -146,6 +153,14 @@ RUN mkdir /app && \
chown -R $USER:0 /app /tmp && \
chmod -R g+rwX /app /tmp

# Need a better way to address these hacks
RUN if [[ "${ENABLE_AIM}" == "true" ]] ; then \
touch /.aim_profile && \
chmod -R 777 /.aim_profile; \
fi
RUN mkdir /.cache && \
chmod -R 777 /.cache

# Copy scripts and default configs
COPY build/accelerate_launch.py fixtures/accelerate_fsdp_defaults.yaml /app/
COPY build/utils.py /app/build/
Expand All @@ -154,12 +169,6 @@ RUN chmod +x /app/accelerate_launch.py
ENV FSDP_DEFAULTS_FILE_PATH="/app/accelerate_fsdp_defaults.yaml"
ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"

# Need a better way to address this hack
RUN touch /.aim_profile && \
chmod -R 777 /.aim_profile && \
mkdir /.cache && \
chmod -R 777 /.cache

WORKDIR /app
USER ${USER}
COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local
Expand Down
Loading