1 change: 1 addition & 0 deletions .github/workflows/amd.yml
@@ -28,6 +28,7 @@ jobs:
# Runs a single command using the runners shell
- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
rocm-smi --showhw
which python
python --version
1 change: 1 addition & 0 deletions .github/workflows/nv-accelerate-v100.yml
@@ -24,6 +24,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
4 changes: 3 additions & 1 deletion .github/workflows/nv-inference.yml
@@ -24,6 +24,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
@@ -40,7 +41,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git checkout v4.21.2
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .
@@ -61,4 +62,5 @@ jobs:
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
EXPECTED_TORCH=$(pip index versions torch | grep -oP -m1 "^\s*LATEST.*\s\K\d+\.\d+")
TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver=$EXPECTED_TORCH --cuda_ver="11.3"
TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 -n 4 --verbose -m 'inference' unit/ --torch_ver=$EXPECTED_TORCH --cuda_ver="11.3"
1 change: 1 addition & 0 deletions .github/workflows/nv-lightning-v100.yml
@@ -24,6 +24,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
1 change: 1 addition & 0 deletions .github/workflows/nv-nightly.yml
@@ -17,6 +17,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
1 change: 1 addition & 0 deletions .github/workflows/nv-torch-latest-v100.yml
@@ -24,6 +24,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
1 change: 1 addition & 0 deletions .github/workflows/nv-torch-nightly-v100.yml
@@ -17,6 +17,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
1 change: 1 addition & 0 deletions .github/workflows/nv-torch12-p40.yml
@@ -24,6 +24,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
1 change: 1 addition & 0 deletions .github/workflows/nv-torch18-v100.yml
@@ -24,6 +24,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
1 change: 1 addition & 0 deletions .github/workflows/nv-transformers-v100.yml
@@ -24,6 +24,7 @@ jobs:

- name: environment
run: |
echo "JobID: $AISC_NODE_INSTANCE_ID"
nvidia-smi
which python
python --version
3 changes: 2 additions & 1 deletion tests/pytest.ini
@@ -1,6 +1,7 @@
[pytest]
addopts = -m "not sequential and not nightly and not inference"
addopts = -m "not sequential and not nightly and not inference and not seq_inference"
markers =
    sequential:Tests that need to be run sequentially
    inference:Inference model tests
    seq_inference:Inference model tests to run sequentially
    nightly:Tests that should be run nightly
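For context, a minimal sketch (a hypothetical test, not part of this PR) of how the new marker interacts with this configuration: anything tagged `seq_inference` is deselected by the default `addopts` above and only runs when a workflow opts in with `-m 'seq_inference'`, as nv-inference.yml now does in a separate, non-parallel pytest invocation.

```python
# Hypothetical example (not part of this PR) showing how the marker is applied.
import pytest


@pytest.mark.seq_inference  # excluded by the default addopts in pytest.ini
def test_big_model_smoke():
    # Selected only by an explicit run such as:
    #   pytest -m 'seq_inference' unit/
    # which nv-inference.yml runs without -n, i.e. one test at a time.
    assert True
```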
78 changes: 71 additions & 7 deletions tests/unit/inference/test_inference.py
@@ -52,7 +52,7 @@ def lm_eval_imports():
"distilgpt2",
"Norod78/hebrew-bad_wiki-gpt_neo-tiny",
"EleutherAI/gpt-j-6B",
"bigscience/bloom-350m",
"bigscience/bloom-560m",
]
_opt_models = [
"facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture.
@@ -111,6 +111,7 @@ def enable_cuda_graph(request):
@pytest.fixture()
def invalid_model_task_config(model_w_task, dtype, enable_cuda_graph):
    model, task = model_w_task
    msg = ""
    if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
        msg = "DS inference injection doesn't work well on older torch versions"
    elif model not in pytest.all_models[task]:
@@ -120,10 +121,17 @@ def invalid_model_task_config(model_w_task, dtype, enable_cuda_graph):
    elif enable_cuda_graph and pkg_version.parse(
            torch.__version__) < pkg_version.parse("1.10"):
        msg = "CUDA Graph is only available in torch versions >= 1.10"
    elif ("gpt-j-6B" in model) and (dtype == torch.float):
    elif "gpt-j-6B" in model:
        if dtype != torch.half:
            msg = f"Not enough GPU memory to run {model} with dtype {dtype}"
        elif enable_cuda_graph:
            msg = f"Not enough GPU memory to run {model} with CUDA Graph enabled"
    elif "gpt-neox-20b" in model:  # TODO: remove this when neox issues resolved
        msg = "Skipping gpt-neox-20b for now"
    elif ("gpt-neox-20b" in model) and (dtype != torch.half):
        msg = f"Not enough GPU memory to run {model} with dtype {dtype}"
    else:
        msg = ""
    elif ("bloom" in model) and (dtype != torch.half):
        msg = f"Bloom models only support half precision, cannot use dtype {dtype}"
    return msg
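The fixture deliberately returns a reason string (empty when the combination is valid) instead of skipping on its own; each test then guards itself with it. A simplified sketch of that consuming pattern (the real tests below do exactly this):

```python
# Simplified sketch of how the fixture's message is consumed; an empty
# string means the model/task/dtype combination is valid to run.
import pytest


def test_example(model_w_task, dtype, invalid_model_task_config):
    if invalid_model_task_config:
        pytest.skip(invalid_model_task_config)
    # ...otherwise run the baseline vs. DeepSpeed comparison for this combination
```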


@@ -160,7 +168,7 @@ def query(model_w_task):
def inf_kwargs(model_w_task):
    model, task = model_w_task
    if task == "text-generation":
        return {"do_sample": False}
        return {"do_sample": False, "max_length": 20}
    else:
        return {}

@@ -228,7 +236,9 @@ def test(
        local_rank = int(os.getenv("LOCAL_RANK", "0"))

        if "gpt-j-6B" in model and dtype == torch.half:
            _model = AutoModelForCausalLM.from_pretrained(model)
            _model = AutoModelForCausalLM.from_pretrained(model,
                                                          revision="float16",
                                                          torch_dtype=torch.float16)
            tokenizer = AutoTokenizer.from_pretrained(model)
            _model.half()
            pipe = pipeline(
@@ -269,7 +279,9 @@ def test(
        torch.cuda.synchronize()
        ds_time = time.time() - start

        if task == "text-generation":
        # facebook/opt* and some bigscience/bloom* models do not match the
        # baseline exactly; adding an exception for them for now
        if ("opt" in model) or ("bloom" in model):
            bs_output = pipe(query, **inf_kwargs)

        # These performance tests are only measuring the time for a single
@@ -278,6 +290,58 @@ def test(
        assert assert_fn(bs_output, ds_output)


@pytest.mark.seq_inference
@pytest.mark.parametrize("model_w_task",
                         [("gpt2",
                           "text-generation"),
                          ("EleutherAI/gpt-neox-20b",
                           "text-generation"),
                          ("bigscience/bloom-3b",
                           "text-generation")],
                         ids=["gpt2",
                              "gpt-neox",
                              "bloom"])
class TestMPSize(DistributedTest):
    world_size = 4

    def test(
        self,
        model_w_task,
        dtype,
        enable_cuda_graph,
        query,
        inf_kwargs,
        assert_fn,
        invalid_model_task_config,
    ):
        if invalid_model_task_config:
            pytest.skip(invalid_model_task_config)

        model, task = model_w_task
        local_rank = int(os.getenv("LOCAL_RANK", "0"))

        # We have to load these large models on CPU with pipeline because not
        # enough GPU memory
        pipe = pipeline(task, model=model, device=-1, framework="pt")
        bs_output = pipe(query, **inf_kwargs)

        pipe.model = deepspeed.init_inference(
            pipe.model,
            mp_size=self.world_size,
            dtype=dtype,
            replace_method="auto",
            replace_with_kernel_inject=True,
            enable_cuda_graph=enable_cuda_graph,
        )
        # Switch device to GPU so that input tensors are not on CPU
        pipe.device = torch.device(f"cuda:{local_rank}")
        ds_output = pipe(query, **inf_kwargs)

        print(local_rank, "baseline", bs_output)
        print(local_rank, "deepspeed", ds_output)
        assert assert_fn(bs_output, ds_output)

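Outside the test harness, the pattern TestMPSize exercises reduces to roughly the sketch below. This is a sketch under stated assumptions, not part of the PR: it assumes 4 GPUs and a launch via the deepspeed launcher (e.g. `deepspeed --num_gpus 4 this_script.py`), and it uses only the calls already shown in the diff above.

```python
# Rough sketch of the tensor-parallel inference pattern exercised by TestMPSize.
# Assumes: 4 GPUs, launched with `deepspeed --num_gpus 4 this_script.py`.
import os

import torch
import deepspeed
from transformers import pipeline

local_rank = int(os.getenv("LOCAL_RANK", "0"))

# Load on CPU first (device=-1), as the test does to conserve GPU memory.
pipe = pipeline("text-generation",
                model="bigscience/bloom-3b",
                device=-1,
                framework="pt")

# Shard the model across the ranks with kernel injection enabled.
pipe.model = deepspeed.init_inference(
    pipe.model,
    mp_size=4,
    dtype=torch.half,
    replace_method="auto",
    replace_with_kernel_inject=True,
)

# Point the pipeline at this rank's GPU so inputs land on the right device.
pipe.device = torch.device(f"cuda:{local_rank}")
print(pipe("DeepSpeed is", do_sample=False, max_length=20))
```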

@pytest.mark.nightly
@pytest.mark.parametrize(
"model_family, model_name",