diff --git a/cpp/tests/batch_manager/trtGptModelRealDecoderTest.cpp b/cpp/tests/batch_manager/trtGptModelRealDecoderTest.cpp index e61a1add361..2fcef6a1759 100644 --- a/cpp/tests/batch_manager/trtGptModelRealDecoderTest.cpp +++ b/cpp/tests/batch_manager/trtGptModelRealDecoderTest.cpp @@ -1066,7 +1066,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1Tests, ParamTest, BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1092,7 +1092,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1RandomEndIdTests, ParamTest, BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction + testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1128,7 +1128,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest, BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache - testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction + testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction testing::Values(false, true), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1154,7 +1154,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest, BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache - testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction + testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction testing::Values(false, true), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1178,7 +1178,7 @@ INSTANTIATE_TEST_SUITE_P(GptKVOffloadingTest, ParamTest, TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM), testing::Values(BeamConfig{1, {1}}), testing::Values(256), // maxTokensInPagedKvCache - testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction + testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction testing::Values(false, true), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1210,7 +1210,7 @@ INSTANTIATE_TEST_SUITE_P(GptCudaGraphTests, ParamTest, BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false, true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1237,7 +1237,7 @@ INSTANTIATE_TEST_SUITE_P(GptSwitchBwTests, ParamTest, BeamConfig{2, {1}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1261,7 +1261,7 @@ INSTANTIATE_TEST_SUITE_P(GptNProfilesTests, ParamTest, BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache - testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction + testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction testing::Values(false, true), // enableTrtOverlap testing::Values(false, true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1301,7 +1301,7 @@ INSTANTIATE_TEST_SUITE_P(GptSqTests, ParamTest, BeamConfig{1, {1}} //, BeamConfig{2, {2}} ), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1325,7 +1325,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_GptChunkedContextTests, ParamTest, testing::Values(TrtGptModelIfbTestType::BULK), // TrtGptModelIfbTestType testing::Values(BeamConfig{1, {1}}), // beam config testing::Values(257), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1355,7 +1355,7 @@ INSTANTIATE_TEST_SUITE_P(GptChunkedLongContextTests, ParamTest, TrtGptModelIfbTestType::RANDOM), // TrtGptModelIfbTestType testing::Values(BeamConfig{1, {1}}), // beam config testing::Values(std::nullopt, 1024), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1393,7 +1393,7 @@ INSTANTIATE_TEST_SUITE_P(GptDraftTests, ParamTest, TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM), testing::Values(BeamConfig{1, {1}}), // beamConfig testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1419,7 +1419,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogitsTests, ParamTest, TrtGptModelIfbTestType::RANDOM), // testType testing::Values(BeamConfig{1, {1}}), // beamConfig testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(true), // enableChunkedContext testing::Values(false, true), // enableStreamingMode @@ -1445,7 +1445,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogProbsTests, ParamTest, TrtGptModelIfbTestType::RANDOM), // testType testing::Values(BeamConfig{1, {1}}), // beamConfig testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1478,7 +1478,7 @@ INSTANTIATE_TEST_SUITE_P(GptjTests, ParamTest, BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1507,7 +1507,7 @@ INSTANTIATE_TEST_SUITE_P(MambaTests, ParamTest, TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM), testing::Values(BeamConfig{1, {1}}), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1530,7 +1530,7 @@ INSTANTIATE_TEST_SUITE_P(RecurrentGemmaTests, ParamTest, TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM), testing::Values(BeamConfig{1, {1}}), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1574,7 +1574,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaTests, ParamTest, BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1597,7 +1597,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlmTests, ParamTest, TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM), testing::Values(BeamConfig{1, {1}}), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false, true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1621,7 +1621,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlm0Tests, ParamTest, TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM), testing::Values(BeamConfig{1, {1}}), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1646,7 +1646,7 @@ INSTANTIATE_TEST_SUITE_P(MedusaTests, ParamTest, testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK), testing::Values(BeamConfig{1, {1}}), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false, true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1670,7 +1670,7 @@ INSTANTIATE_TEST_SUITE_P(EagleTests, ParamTest, testing::Values(TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT), testing::Values(BeamConfig{1, {1}}), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false, true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1695,7 +1695,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaLookaheadDecodingTests, ParamTest, TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM), testing::Values(BeamConfig{1, {1}}), // beamConfig testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1720,7 +1720,7 @@ INSTANTIATE_TEST_SUITE_P(ExplicitDraftTokensDecodingTests, ParamTest, testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK), testing::Values(BeamConfig{1, {1}}), // beamConfig testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(false, true), // enableChunkedContext testing::Values(false), // enableStreamingMode @@ -1751,7 +1751,7 @@ INSTANTIATE_TEST_SUITE_P(GptjFP8Tests, ParamTest, BeamConfig{1, {1}} // , BeamConfig{2, {2}}, BeamConfig{2, {1, 2}} ), testing::Values(std::nullopt), // maxTokensInPagedKvCache - testing::Values(std::nullopt), // freeGpuMemoryFraction + testing::Values(0.4), // freeGpuMemoryFraction testing::Values(false), // enableTrtOverlap testing::Values(true), // enableChunkedContext testing::Values(false), // enableStreamingMode diff --git a/cpp/tests/executor/encDecTest.cpp b/cpp/tests/executor/encDecTest.cpp index 2ed8baba080..b6c0b7f7162 100644 --- a/cpp/tests/executor/encDecTest.cpp +++ b/cpp/tests/executor/encDecTest.cpp @@ -125,8 +125,8 @@ TEST_P(EncDecParamsTest, validEncDecCtor) std::filesystem::path encEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "encoder"; std::filesystem::path decEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "decoder"; ExecutorConfig executorConfig{}; - FloatType freeGpuMemoryFraction = 0.5f; - FloatType crossKvCacheFraction = 0.5f; + FloatType freeGpuMemoryFraction = 0.4f; + FloatType crossKvCacheFraction = 0.4f; KvCacheConfig kvCacheConfig{false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction}; kvCacheConfig.setCrossKvCacheFraction(crossKvCacheFraction); executorConfig.setKvCacheConfig(kvCacheConfig); diff --git a/cpp/tests/executor/executorTest.cpp b/cpp/tests/executor/executorTest.cpp index 0e0dce30ee7..44ff6ca605e 100644 --- a/cpp/tests/executor/executorTest.cpp +++ b/cpp/tests/executor/executorTest.cpp @@ -207,7 +207,7 @@ TEST_F(GptExecutorTest, ReturnAcceptedTokenLogits) // Enable kv cache reuse of executorConfig bool enableBlockReuse = true; - FloatType freeGpuMemoryFraction = 0.5; + FloatType freeGpuMemoryFraction = 0.4; auto kvCacheConfig = KvCacheConfig(enableBlockReuse, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction); executorConfig.setKvCacheConfig(kvCacheConfig); diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 82bbbbe7e37..5f2d2ee5e15 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1238,6 +1238,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 2], "A30-TensorRT-[Post-Merge]-1": ["a30", "l0_a30", 1, 2], "A30-TensorRT-[Post-Merge]-2": ["a30", "l0_a30", 2, 2], + "A30-CPP-[Post-Merge]-1": ["a30", "l0_a30", 1, 1], "A100X-TensorRT-[Post-Merge]-1": ["a100x", "l0_a100", 1, 2], "A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2], "L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2], diff --git a/tests/integration/defs/cpp_common.py b/tests/integration/defs/cpp_common.py index 46a08c2bbc1..fcdb4b76283 100755 --- a/tests/integration/defs/cpp_common.py +++ b/tests/integration/defs/cpp_common.py @@ -737,15 +737,7 @@ def run_single_gpu_tests(build_dir: _pl.Path, if excluded_tests: ctest.extend(["-E", "|".join(excluded_tests)]) - gpt_tests = {"gpt", "gpt_session", "gpt_tests", "gpt_executor"} - - # gpt* tests are not parallelized as it would cause OOM because kv cache memory allocations - # exist in multiple running tests - if gpt_tests.intersection(test_list): - parallel = 1 - else: - parallel = default_test_parallel - + parallel = default_test_parallel if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE", None): parallel = int(parallel_override) diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml index a08ba5adc58..ff552c2b9ad 100644 --- a/tests/integration/test_lists/test-db/l0_a30.yml +++ b/tests/integration/test_lists/test-db/l0_a30.yml @@ -42,9 +42,7 @@ l0_a30: - test_cpp.py::test_unit_tests[80] - test_cpp.py::test_model[gpt-80] - test_cpp.py::test_model[gpt_executor-80] - - test_cpp.py::test_model[gpt_session-80] - test_cpp.py::test_model[gpt_tests-80] - - test_cpp.py::test_benchmarks[gpt-80] - condition: ranges: system_gpu_count: @@ -145,3 +143,18 @@ l0_a30: - examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-draft_len_4-float16-bs2] # 1 min - examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2] # 1 min - examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2] # 1 min +- condition: + ranges: + system_gpu_count: + gte: 1 + lte: 1 + wildcards: + gpu: + - '*a30*' + linux_distribution_name: ubuntu* + terms: + stage: post_merge + backend: cpp + tests: + - test_cpp.py::test_model[gpt_session-80] + - test_cpp.py::test_benchmarks[gpt-80]