diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml
index 975339952c12..97b96f2d5875 100644
--- a/.github/workflows/amd.yml
+++ b/.github/workflows/amd.yml
@@ -28,7 +28,6 @@ jobs:
       # Runs a single command using the runners shell
       - name: environment
         run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           rocm-smi --showhw
           which python
           python --version
diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 436e11b3f822..cc1abfc5717d 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -24,7 +24,6 @@ jobs:
 
       - name: environment
         run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
index dc00682edd46..681fc90be78f 100644
--- a/.github/workflows/nv-inference.yml
+++ b/.github/workflows/nv-inference.yml
@@ -24,7 +24,6 @@ jobs:
 
       - name: environment
         run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
@@ -41,7 +40,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout v4.21.2
+          # git checkout 1cc453d33
           git rev-parse --short HEAD
           pip uninstall --yes transformers
           pip install .
@@ -62,5 +61,4 @@ jobs:
           if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
           EXPECTED_TORCH=$(pip index versions torch | grep -oP -m1 "^\s*LATEST.*\s\K\d+\.\d+")
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver=$EXPECTED_TORCH --cuda_ver="11.3"
           TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 -n 4 --verbose -m 'inference' unit/ --torch_ver=$EXPECTED_TORCH --cuda_ver="11.3"
diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml
index a63f6b75e769..9fe1043b96d0 100644
--- a/.github/workflows/nv-lightning-v100.yml
+++ b/.github/workflows/nv-lightning-v100.yml
@@ -24,7 +24,6 @@ jobs:
 
       - name: environment
         run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
index 40abe198c7cd..98afd75105a3 100644
--- a/.github/workflows/nv-nightly.yml
+++ b/.github/workflows/nv-nightly.yml
@@ -17,7 +17,6 @@ jobs:
 
      - name: environment
        run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
index c8a3c32bdc36..1dc535f9b327 100644
--- a/.github/workflows/nv-torch-latest-v100.yml
+++ b/.github/workflows/nv-torch-latest-v100.yml
@@ -24,7 +24,6 @@ jobs:
 
      - name: environment
        run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml
index d456483c72f5..e1c916afba2d 100644
--- a/.github/workflows/nv-torch-nightly-v100.yml
+++ b/.github/workflows/nv-torch-nightly-v100.yml
@@ -17,7 +17,6 @@ jobs:
 
      - name: environment
        run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
diff --git a/.github/workflows/nv-torch12-p40.yml b/.github/workflows/nv-torch12-p40.yml
index 29b9f891c3bb..944ba3beb19d 100644
--- a/.github/workflows/nv-torch12-p40.yml
+++ b/.github/workflows/nv-torch12-p40.yml
@@ -24,7 +24,6 @@ jobs:
 
      - name: environment
        run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
diff --git a/.github/workflows/nv-torch18-v100.yml b/.github/workflows/nv-torch18-v100.yml
index 2971416ebd81..b512ea29113f 100644
--- a/.github/workflows/nv-torch18-v100.yml
+++ b/.github/workflows/nv-torch18-v100.yml
@@ -24,7 +24,6 @@ jobs:
 
      - name: environment
        run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
index bfc9919be1f9..945457b304c5 100644
--- a/.github/workflows/nv-transformers-v100.yml
+++ b/.github/workflows/nv-transformers-v100.yml
@@ -24,7 +24,6 @@ jobs:
 
      - name: environment
        run: |
-          echo "JobID: $AISC_NODE_INSTANCE_ID"
           nvidia-smi
           which python
           python --version
diff --git a/tests/pytest.ini b/tests/pytest.ini
index b7ee315be801..a52a49e5bbc3 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -1,7 +1,6 @@
 [pytest]
-addopts = -m "not sequential and not nightly and not inference and not seq_inference"
+addopts = -m "not sequential and not nightly and not inference"
 markers =
     sequential:Tests that need to be run sequentially
     inference:Inference model tests
-    seq_inference:Inference model tests to run sequentially
     nightly:Tests that should be run nightly
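Note on the pytest.ini hunk above: `addopts` applies to every pytest invocation, so marker-tagged tests are deselected by default and run only when a workflow opts in explicitly (as nv-inference.yml does with `-m 'inference'`). A minimal sketch of how a marked test interacts with the restored `addopts` line; the test below is hypothetical and not part of this patch:

```python
# Hypothetical example, not part of this patch: with
#   addopts = -m "not sequential and not nightly and not inference"
# a bare `pytest` run deselects this test; CI selects it explicitly
# with `pytest -m 'inference'`.
import pytest


@pytest.mark.inference
def test_marked_inference_example():
    assert True  # placeholder body; real tests live under tests/unit/inference/
```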
diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index 1b1efdc595fe..cdc3e83232de 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -52,7 +52,7 @@ def lm_eval_imports():
     "distilgpt2",
     "Norod78/hebrew-bad_wiki-gpt_neo-tiny",
     "EleutherAI/gpt-j-6B",
-    "bigscience/bloom-560m",
+    "bigscience/bloom-350m",
 ]
 _opt_models = [
     "facebook/opt-125m",  # 125m, 1.7B, ..., 175B variants have the same model architecture.
@@ -111,7 +111,6 @@ def enable_cuda_graph(request):
 @pytest.fixture()
 def invalid_model_task_config(model_w_task, dtype, enable_cuda_graph):
     model, task = model_w_task
-    msg = ""
     if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
         msg = "DS inference injection doesn't work well on older torch versions"
     elif model not in pytest.all_models[task]:
@@ -121,17 +120,10 @@ def invalid_model_task_config(model_w_task, dtype, enable_cuda_graph):
     elif enable_cuda_graph and pkg_version.parse(
             torch.__version__) < pkg_version.parse("1.10"):
         msg = "CUDA Graph is only available in torch versions >= 1.10"
-    elif "gpt-j-6B" in model:
-        if dtype != torch.half:
-            msg = f"Not enough GPU memory to run {model} with dtype {dtype}"
-        elif enable_cuda_graph:
-            msg = f"Not enough GPU memory to run {model} with CUDA Graph enabled"
-    elif "gpt-neox-20b" in model:  # TODO: remove this when neox issues resolved
-        msg = "Skipping gpt-neox-20b for now"
-    elif ("gpt-neox-20b" in model) and (dtype != torch.half):
+    elif ("gpt-j-6B" in model) and (dtype == torch.float):
         msg = f"Not enough GPU memory to run {model} with dtype {dtype}"
-    elif ("bloom" in model) and (dtype != torch.half):
-        msg = f"Bloom models only support half precision, cannot use dtype {dtype}"
+    else:
+        msg = ""
 
     return msg
 
@@ -168,7 +160,7 @@ def query(model_w_task):
 def inf_kwargs(model_w_task):
     model, task = model_w_task
     if task == "text-generation":
-        return {"do_sample": False, "max_length": 20}
+        return {"do_sample": False}
     else:
         return {}
 
@@ -236,9 +228,7 @@ def test(
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
 
         if "gpt-j-6B" in model and dtype == torch.half:
-            _model = AutoModelForCausalLM.from_pretrained(model,
-                                                          revision="float16",
-                                                          torch_dtype=torch.float16)
+            _model = AutoModelForCausalLM.from_pretrained(model)
             tokenizer = AutoTokenizer.from_pretrained(model)
             _model.half()
             pipe = pipeline(
@@ -279,9 +269,7 @@ def test(
         torch.cuda.synchronize()
         ds_time = time.time() - start
 
-        # facebook/opt* and some bigscient/bloom* models are not matching
-        # baseline exactly, adding an exception to them for now
-        if ("opt" in model) or ("bloom" in model):
+        if task == "text-generation":
             bs_output = pipe(query, **inf_kwargs)
 
         # These performance tests are only measuring the time for a single
@@ -290,58 +278,6 @@ def test(
         assert assert_fn(bs_output, ds_output)
 
 
-@pytest.mark.seq_inference
-@pytest.mark.parametrize("model_w_task",
-                         [("gpt2",
-                           "text-generation"),
-                          ("EleutherAI/gpt-neox-20b",
-                           "text-generation"),
-                          ("bigscience/bloom-3b",
-                           "text-generation")],
-                         ids=["gpt2",
-                              "gpt-neox",
-                              "bloom"])
-class TestMPSize(DistributedTest):
-    world_size = 4
-
-    def test(
-        self,
-        model_w_task,
-        dtype,
-        enable_cuda_graph,
-        query,
-        inf_kwargs,
-        assert_fn,
-        invalid_model_task_config,
-    ):
-        if invalid_model_task_config:
-            pytest.skip(invalid_model_task_config)
-
-        model, task = model_w_task
-        local_rank = int(os.getenv("LOCAL_RANK", "0"))
-
-        # We have to load these large models on CPU with pipeline because not
-        # enough GPU memory
-        pipe = pipeline(task, model=model, device=-1, framework="pt")
-        bs_output = pipe(query, **inf_kwargs)
-
-        pipe.model = deepspeed.init_inference(
-            pipe.model,
-            mp_size=self.world_size,
-            dtype=dtype,
-            replace_method="auto",
-            replace_with_kernel_inject=True,
-            enable_cuda_graph=enable_cuda_graph,
-        )
-        # Switch device to GPU so that input tensors are not on CPU
-        pipe.device = torch.device(f"cuda:{local_rank}")
-        ds_output = pipe(query, **inf_kwargs)
-
-        print(local_rank, "baseline", bs_output)
-        print(local_rank, "deepspeed", ds_output)
-        assert assert_fn(bs_output, ds_output)
-
-
 @pytest.mark.nightly
 @pytest.mark.parametrize(
     "model_family, model_name",
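For reference, the `TestMPSize` class removed above exercised DeepSpeed's tensor-parallel kernel injection on models too large to load on a single GPU. Below is a standalone sketch of that pattern, reconstructed from the deleted test; the 4-GPU world size and the `gpt2` stand-in model are assumptions, and the script would be launched with something like `deepspeed --num_gpus 4 example.py`:

```python
# Sketch reconstructed from the removed TestMPSize test; the model name and
# world size are illustrative assumptions, not prescriptive.
import os

import torch
import deepspeed
from transformers import pipeline

local_rank = int(os.getenv("LOCAL_RANK", "0"))

# Load on CPU first (device=-1) so the full-precision weights need not fit
# on a single GPU.
pipe = pipeline("text-generation", model="gpt2", device=-1, framework="pt")
baseline = pipe("DeepSpeed is", do_sample=False)

# Shard the model across ranks and swap in DeepSpeed inference kernels.
pipe.model = deepspeed.init_inference(
    pipe.model,
    mp_size=4,  # assumed world size; match the number of launched ranks
    dtype=torch.half,
    replace_method="auto",
    replace_with_kernel_inject=True,
)
# Point the pipeline at this rank's GPU so input tensors are not on CPU.
pipe.device = torch.device(f"cuda:{local_rank}")
ds_output = pipe("DeepSpeed is", do_sample=False)

print(local_rank, "baseline", baseline)
print(local_rank, "deepspeed", ds_output)
```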