Merged

48 commits
bcd731c
add fill masks tests
jeffra Jun 3, 2022
1915826
add a temporary fix for the Bert-Type models; TODO: Fix this for the …
Jun 3, 2022
965d109
add cuda graph to tests
jeffra Jun 3, 2022
05a9ca7
add fp16/32
jeffra Jun 3, 2022
2acba69
add q&a tests and set hf framework to pt
jeffra Jun 7, 2022
7822770
Merge branch 'master' into jeffra/fill-mask-tests
jeffra Jun 7, 2022
5724f7e
fixed assignment of DS pipeline model
mrwyattii Jun 7, 2022
ef350e9
generalized the model/task testing and added pytest fixtures
mrwyattii Jun 7, 2022
9a934fe
fixed syntax error
mrwyattii Jun 7, 2022
fbc05fe
keyerror with text-gen assert_fn
mrwyattii Jun 7, 2022
fc569e9
added back gptj model
mrwyattii Jun 7, 2022
fbdd6b8
Merge branch 'master' into jeffra/fill-mask-tests
RezaYazdaniAminabadi Jun 8, 2022
9c06f04
added temporary fix to get gpt models passing
mrwyattii Jun 9, 2022
c10acaf
fixing some issue for the Bert-Postln model
Jun 9, 2022
4440aed
Merge branch 'jeffra/fill-mask-tests' of github.com:microsoft/DeepSpe…
Jun 9, 2022
0519a72
formatting
mrwyattii Jun 9, 2022
3175430
added fp16 tests
mrwyattii Jun 9, 2022
9cdd981
fixed bad comparison
mrwyattii Jun 9, 2022
3d93c07
avoid CUDA init when getting GPU memory
mrwyattii Jun 9, 2022
d864d2b
revert to not checking GPU memory
mrwyattii Jun 9, 2022
15ed0a2
extra import
mrwyattii Jun 9, 2022
2c88ab1
refactored validation of test config, add skip for AMD w/CUDA Graph
mrwyattii Jun 10, 2022
0872f9f
cuda graph requires 1.10+
jeffra Jun 10, 2022
1d08254
ensure torch > 1.10 if cuda graph enabled, also fix typo
jeffra Jun 10, 2022
816e746
prune incompatible model+task pairs
jeffra Jun 10, 2022
42841c6
add pretty print names for test fixtures
jeffra Jun 10, 2022
2677d20
use fp16 revision of gpt-j
jeffra Jun 10, 2022
3ec3f9f
convert pipe model to half for non-DS case
jeffra Jun 10, 2022
d37f9d9
Merge branch 'master' into jeffra/fill-mask-tests
jeffra Jun 10, 2022
8b88eef
remove dep on hf-accelerate
jeffra Jun 11, 2022
50cbfce
manual convert gpt-j fp32->fp16, seeing issues with fp16 revision
jeffra Jun 11, 2022
f3e40ae
force inference tests to be seq
jeffra Jun 13, 2022
d3f6dfe
added correctness tests for text-generation gpt models
mrwyattii Jun 13, 2022
71a5840
fixed device string for lm tests
mrwyattii Jun 13, 2022
f99fc0d
during injection reshape data on cpu to avoid extra GPU memory
jeffra Jun 13, 2022
1d21343
increased acceptable ppl diff val
mrwyattii Jun 13, 2022
2777c23
added perf test and fix for gpt-j-6B OOM on V100 16GB
mrwyattii Jun 13, 2022
eea4c1e
remove sequential and move nv-nightly to nightly
jeffra Jun 13, 2022
c0d695c
Merge branch 'master' into jeffra/fill-mask-tests
jeffra Jun 13, 2022
b6b6682
Merge branch 'master' into jeffra/fill-mask-tests
jeffra Jun 15, 2022
c525af2
skip inference tests on rocm for now
jeffra Jun 15, 2022
e4573b2
add missing import
jeffra Jun 15, 2022
f01df0f
make performance asserts less strict
mrwyattii Jun 15, 2022
368427b
fix for datasets error
mrwyattii Jun 15, 2022
3a4e12c
seperate inference tests from regular unit tests
mrwyattii Jun 15, 2022
0ea845d
added new req file for inference workflow
mrwyattii Jun 15, 2022
2043d57
skip test if deps aren't installed
jeffra Jun 15, 2022
21d4beb
fix for inf reqs install
mrwyattii Jun 15, 2022
2 changes: 1 addition & 1 deletion .github/workflows/amd.yml
@@ -63,5 +63,5 @@ jobs:
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -m 'sequential' unit/
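The workflow changes in this PR hinge on pytest markers: the default runs drop `-m 'not sequential'`, while `sequential`, `inference`, and `nightly` selections move to dedicated invocations and workflows. A minimal sketch of how such markers are typically registered so pytest does not warn about unknown marks (the actual DeepSpeed conftest is not part of this diff; marker names are taken from the `-m` filters in these workflows):

```python
# conftest.py (illustrative sketch, not the repo's actual conftest)
def pytest_configure(config):
    config.addinivalue_line("markers", "sequential: tests that must not run in parallel")
    config.addinivalue_line("markers", "inference: inference unit tests (nv-inference.yml)")
    config.addinivalue_line("markers", "nightly: long-running tests (nv-nightly.yml)")
```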
63 changes: 63 additions & 0 deletions .github/workflows/nv-inference.yml
@@ -0,0 +1,63 @@
name: nv-inference

on:
push:
branches:
- 'master'
- 'staging**'
paths-ignore:
- 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn,inf]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'inference' unit/
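This job installs the `inf` extra (`pip install .[dev,1bit,autotuning,sparse_attn,inf]`) and then runs only tests selected by `-m 'inference'`. A hedged sketch of how an individual test might opt in to that marker and skip itself when optional dependencies are missing; the test and package names here are illustrative, following the commits "add fill masks tests" and "skip test if deps aren't installed":

```python
import pytest

# Skip the module cleanly when optional inference deps are absent instead of erroring out.
transformers = pytest.importorskip("transformers")

@pytest.mark.inference
def test_fill_mask_smoke():
    # Stand-in body; per the commit history, the real tests build HF pipelines and
    # compare baseline outputs against DeepSpeed-injected ones (e.g. via deepspeed.init_inference).
    assert transformers.__version__
```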
52 changes: 52 additions & 0 deletions .github/workflows/nv-nightly.yml
@@ -0,0 +1,52 @@
name: nv-nightly

on:
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-v100.yml
@@ -60,5 +60,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-nightly-v100.yml
@@ -53,5 +53,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch18-v100.yml
@@ -60,5 +60,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
2 changes: 2 additions & 0 deletions .github/workflows/nv-transformers-v100.yml
@@ -57,6 +57,8 @@ jobs:
pip install .[testing]
# find reqs used in ds integration tests
find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
# force datasets version due to issues
pip install datasets==2.2.2
# force protobuf version due to issues
pip install "protobuf<4.21.0"
pip list
1 change: 1 addition & 0 deletions bin/dsr
65 changes: 36 additions & 29 deletions csrc/transformer/inference/csrc/gelu.cu
@@ -174,7 +174,8 @@ __global__ void fused_bias_residual(float* input,
float* attnbias,
int total_count,
int intermediate_size,
int mp_size)
int mp_size,
bool preln)
{
float4* input_cast = reinterpret_cast<float4*>(input);
float4* output_cast = reinterpret_cast<float4*>(output);
@@ -189,12 +190,17 @@
float4 res_vec = attn_cast[offset];
float4 bias_data = bias_cast[offset % intermediate_size];
float4 attn_bias = attnbias_cast[offset % intermediate_size];

data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x);
data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y);
data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z);
data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w);

if (preln) {
data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x);
data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y);
data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z);
data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w);
} else {
data.x = data.x + out.x + bias_data.x;
data.y = data.y + out.y + bias_data.y;
data.z = data.z + out.z + bias_data.z;
data.w = data.w + out.w + bias_data.w;
}
output_cast[offset] = data;
}
}
@@ -206,7 +212,8 @@ __global__ void fused_bias_residual(__half* input,
__half* attn_bias,
int total_count,
int intermediate_size,
int mp_size)
int mp_size,
bool preln)
{
#ifdef HALF_PRECISION_AVAILABLE

@@ -248,15 +255,21 @@ __global__ void fused_bias_residual(__half* input,
float2 attn_low_bias = __half22float2(attnbias_half[0]);
float2 attn_high_bias = __half22float2(attnbias_half[1]);

low_data.x =
(low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x));
low_data.y =
(low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y));
high_data.x =
(high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x));
high_data.y =
(high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y));

if (preln) {
low_data.x =
(low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x));
low_data.y =
(low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y));
high_data.x = (high_data.x + high_res.x) * mp_size +
(high_out.x + (high_bias.x + attn_high_bias.x));
high_data.y = (high_data.y + high_res.y) * mp_size +
(high_out.y + (high_bias.y + attn_high_bias.y));
} else {
low_data.x = (low_data.x + low_out.x + low_bias.x);
low_data.y = (low_data.y + low_out.y + low_bias.y);
high_data.x = (high_data.x + high_out.x + high_bias.x);
high_data.y = (high_data.y + high_out.y + high_bias.y);
}
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);

@@ -274,27 +287,21 @@ void launch_bias_residual(T* input,
int batch,
int hidden_dim,
int mp_size,
bool preln,
cudaStream_t stream)
{
int total_count = batch * hidden_dim / 4;
dim3 block_dims(1024);
dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size);

fused_bias_residual<<<grid_dims, block_dims, 0, stream>>>(
input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size);
input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size, preln);
}

template void
launch_bias_residual<float>(float*, float*, float*, float*, float*, int, int, int, cudaStream_t);
template void launch_bias_residual<__half>(__half*,
__half*,
__half*,
__half*,
__half*,
int,
int,
int,
cudaStream_t);
template void launch_bias_residual<
float>(float*, float*, float*, float*, float*, int, int, int, bool, cudaStream_t);
template void launch_bias_residual<
__half>(__half*, __half*, __half*, __half*, __half*, int, int, int, bool, cudaStream_t);

__global__ void gptj_residual_add(float* input,
float* output,
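In summary, `fused_bias_residual` (both the float and __half variants) now takes a `preln` flag and only applies the scaled residual-plus-attention-bias update on the pre-LayerNorm path. A plain-Python reference of the per-element math, with names mirroring the kernel arguments; note the launcher passes `1.0 / mp_size` as the scale argument. This is an illustrative sketch, not a DeepSpeed API:

```python
def fused_bias_residual_ref(data, out, res_vec, bias, attn_bias, scale, preln):
    """Per-element update performed by the revised kernel.

    data     -- value from `input` (updated in place by the kernel)
    out      -- value from `output`
    res_vec  -- attention residual (`attn`)
    scale    -- receives 1.0 / mp_size from launch_bias_residual
    """
    if preln:
        # pre-LN path: rescale (hidden + attention residual), then add output and both biases
        return (data + res_vec) * scale + (out + bias + attn_bias)
    # post-LN path: neither the attention residual nor the attention bias is folded in here
    return data + out + bias

# one element, mp_size == 2
assert fused_bias_residual_ref(1.0, 0.5, 0.25, 0.1, 0.2, 0.5, preln=False) == 1.6
```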
74 changes: 39 additions & 35 deletions csrc/transformer/inference/csrc/pt_binding.cpp
@@ -787,17 +787,17 @@ at::Tensor ds_vector_matmul_int8(at::Tensor& input,
}

template <typename T>
void mlp_unfused_cublas(at::Tensor& output,
at::Tensor& input,
at::Tensor& residual,
at::Tensor& input_bias,
at::Tensor& weight,
at::Tensor& bias,
at::Tensor& gamma,
at::Tensor& beta,
const float epsilon,
bool preLayerNorm,
bool mlp_after_attn)
at::Tensor mlp_unfused_cublas(at::Tensor& output,
at::Tensor& input,
at::Tensor& residual,
at::Tensor& input_bias,
at::Tensor& weight,
at::Tensor& bias,
at::Tensor& gamma,
at::Tensor& beta,
const float epsilon,
bool preLayerNorm,
bool mlp_after_attn)
{
int bsz = input.size(0) * input.size(1);
auto inp_norm = at::empty_like(input);
@@ -840,18 +840,19 @@ void mlp_unfused_cublas(at::Tensor& output,
weight.size(1),
bsz,
Context::Instance().GetCurrentStream());
return inp_norm;
}
template <typename T>
at::Tensor ds_mlp_gemm(at::Tensor& input,
at::Tensor& residual,
at::Tensor& input_bias,
at::Tensor& weight,
at::Tensor& bias,
at::Tensor& gamma,
at::Tensor& beta,
const float epsilon,
bool preLayerNorm,
bool mlp_after_attn)
std::vector<at::Tensor> ds_mlp_gemm(at::Tensor& input,
at::Tensor& residual,
at::Tensor& input_bias,
at::Tensor& weight,
at::Tensor& bias,
at::Tensor& gamma,
at::Tensor& beta,
const float epsilon,
bool preLayerNorm,
bool mlp_after_attn)
{
auto input_cont = input.contiguous();
auto options = at::TensorOptions()
@@ -863,19 +864,19 @@ at::Tensor ds_mlp_gemm(at::Tensor& input,
auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options);
int bsz = input_cont.size(0) * input_cont.size(1);

mlp_unfused_cublas<T>(output,
mlp_after_attn ? input : residual,
residual,
input_bias,
weight,
bias,
gamma,
beta,
epsilon,
preLayerNorm,
mlp_after_attn);

return output;
auto res_add = mlp_unfused_cublas<T>(output,
mlp_after_attn ? input : residual,
residual,
input_bias,
weight,
bias,
gamma,
beta,
epsilon,
preLayerNorm,
mlp_after_attn);

return {output, res_add};
}

template <typename T>
@@ -1001,7 +1002,8 @@ void residual_add_bias(at::Tensor& output,
at::Tensor& attention_b,
int mp_size,
bool mlp_after_attn,
bool add_bias)
bool add_bias,
bool preln)
{
int bsz = input.size(0) * input.size(1);
int hidden_size = input.size(2);
@@ -1017,6 +1019,7 @@
bsz,
hidden_size,
mp_size,
preln,
Context::Instance().GetCurrentStream());
else
launch_gptj_residual_add<float>((float*)input.data_ptr(),
@@ -1037,6 +1040,7 @@
bsz,
hidden_size,
mp_size,
preln,
Context::Instance().GetCurrentStream());
else
launch_gptj_residual_add<__half>((__half*)input.data_ptr(),
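Note the knock-on effect of the binding changes above: `ds_mlp_gemm` now returns two tensors (`{output, res_add}`) instead of one, and `residual_add_bias` gained a trailing `preln` argument, so Python-side callers have to unpack the pair and pass the extra flag. A small, hypothetical shim illustrating the unpacking; the compiled op is stubbed out, and the real op names and calling module are not shown in this diff:

```python
import torch

def call_mlp_gemm(mlp_gemm_op, *args):
    # New contract: the op returns [output, res_add]; older callers expected a single tensor.
    output, residual_add = mlp_gemm_op(*args)
    return output, residual_add

# Stand-in op so the sketch runs without the compiled inference extension.
fake_op = lambda *args: (torch.zeros(1, 4, 8), torch.zeros(1, 4, 8))
out, res_add = call_mlp_gemm(fake_op)
assert out.shape == res_add.shape
```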
1 change: 1 addition & 0 deletions csrc/transformer/inference/includes/custom_cuda_layers.h
@@ -58,6 +58,7 @@ void launch_bias_residual(T* input,
int batch,
int hidden_dim,
int mp_size,
bool preln,
cudaStream_t stream);

template <typename T>