50 changes: 25 additions & 25 deletions cpp/tests/batch_manager/trtGptModelRealDecoderTest.cpp
@@ -1066,7 +1066,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1Tests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1092,7 +1092,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1RandomEndIdTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
+ testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1128,7 +1128,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
- testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
+ testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1154,7 +1154,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
- testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
+ testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1178,7 +1178,7 @@ INSTANTIATE_TEST_SUITE_P(GptKVOffloadingTest, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(256), // maxTokensInPagedKvCache
- testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
+ testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1210,7 +1210,7 @@ INSTANTIATE_TEST_SUITE_P(GptCudaGraphTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1237,7 +1237,7 @@ INSTANTIATE_TEST_SUITE_P(GptSwitchBwTests, ParamTest,
BeamConfig{2, {1}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1261,7 +1261,7 @@ INSTANTIATE_TEST_SUITE_P(GptNProfilesTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
- testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
+ testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1301,7 +1301,7 @@ INSTANTIATE_TEST_SUITE_P(GptSqTests, ParamTest,
BeamConfig{1, {1}} //, BeamConfig{2, {2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1325,7 +1325,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_GptChunkedContextTests, ParamTest,
testing::Values(TrtGptModelIfbTestType::BULK), // TrtGptModelIfbTestType
testing::Values(BeamConfig{1, {1}}), // beam config
testing::Values(257), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1355,7 +1355,7 @@ INSTANTIATE_TEST_SUITE_P(GptChunkedLongContextTests, ParamTest,
TrtGptModelIfbTestType::RANDOM), // TrtGptModelIfbTestType
testing::Values(BeamConfig{1, {1}}), // beam config
testing::Values(std::nullopt, 1024), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1393,7 +1393,7 @@ INSTANTIATE_TEST_SUITE_P(GptDraftTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1419,7 +1419,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogitsTests, ParamTest,
TrtGptModelIfbTestType::RANDOM), // testType
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false, true), // enableStreamingMode
@@ -1445,7 +1445,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogProbsTests, ParamTest,
TrtGptModelIfbTestType::RANDOM), // testType
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1478,7 +1478,7 @@ INSTANTIATE_TEST_SUITE_P(GptjTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1507,7 +1507,7 @@ INSTANTIATE_TEST_SUITE_P(MambaTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1530,7 +1530,7 @@ INSTANTIATE_TEST_SUITE_P(RecurrentGemmaTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1574,7 +1574,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1597,7 +1597,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlmTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1621,7 +1621,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlm0Tests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1646,7 +1646,7 @@ INSTANTIATE_TEST_SUITE_P(MedusaTests, ParamTest,
testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1670,7 +1670,7 @@ INSTANTIATE_TEST_SUITE_P(EagleTests, ParamTest,
testing::Values(TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1695,7 +1695,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaLookaheadDecodingTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1720,7 +1720,7 @@ INSTANTIATE_TEST_SUITE_P(ExplicitDraftTokensDecodingTests, ParamTest,
testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@@ -1751,7 +1751,7 @@ INSTANTIATE_TEST_SUITE_P(GptjFP8Tests, ParamTest,
BeamConfig{1, {1}} // , BeamConfig{2, {2}}, BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
- testing::Values(std::nullopt), // freeGpuMemoryFraction
+ testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
4 changes: 2 additions & 2 deletions cpp/tests/executor/encDecTest.cpp
@@ -125,8 +125,8 @@ TEST_P(EncDecParamsTest, validEncDecCtor)
std::filesystem::path encEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "encoder";
std::filesystem::path decEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "decoder";
ExecutorConfig executorConfig{};
- FloatType freeGpuMemoryFraction = 0.5f;
- FloatType crossKvCacheFraction = 0.5f;
+ FloatType freeGpuMemoryFraction = 0.4f;
+ FloatType crossKvCacheFraction = 0.4f;
KvCacheConfig kvCacheConfig{false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction};
kvCacheConfig.setCrossKvCacheFraction(crossKvCacheFraction);
executorConfig.setKvCacheConfig(kvCacheConfig);
2 changes: 1 addition & 1 deletion cpp/tests/executor/executorTest.cpp
@@ -207,7 +207,7 @@ TEST_F(GptExecutorTest, ReturnAcceptedTokenLogits)

// Enable kv cache reuse of executorConfig
bool enableBlockReuse = true;
- FloatType freeGpuMemoryFraction = 0.5;
+ FloatType freeGpuMemoryFraction = 0.4;
auto kvCacheConfig
= KvCacheConfig(enableBlockReuse, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
executorConfig.setKvCacheConfig(kvCacheConfig);
1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -1238,6 +1238,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 2],
"A30-TensorRT-[Post-Merge]-1": ["a30", "l0_a30", 1, 2],
"A30-TensorRT-[Post-Merge]-2": ["a30", "l0_a30", 2, 2],
+ "A30-CPP-[Post-Merge]-1": ["a30", "l0_a30", 1, 1],
"A100X-TensorRT-[Post-Merge]-1": ["a100x", "l0_a100", 1, 2],
"A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
"L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2],
10 changes: 1 addition & 9 deletions tests/integration/defs/cpp_common.py
@@ -737,15 +737,7 @@ def run_single_gpu_tests(build_dir: _pl.Path,
if excluded_tests:
ctest.extend(["-E", "|".join(excluded_tests)])

- gpt_tests = {"gpt", "gpt_session", "gpt_tests", "gpt_executor"}
-
- # gpt* tests are not parallelized as it would cause OOM because kv cache memory allocations
- # exist in multiple running tests
- if gpt_tests.intersection(test_list):
-     parallel = 1
- else:
-     parallel = default_test_parallel
-
+ parallel = default_test_parallel
if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE",
None):
parallel = int(parallel_override)
17 changes: 15 additions & 2 deletions tests/integration/test_lists/test-db/l0_a30.yml
@@ -42,9 +42,7 @@ l0_a30:
- test_cpp.py::test_unit_tests[80]
- test_cpp.py::test_model[gpt-80]
- test_cpp.py::test_model[gpt_executor-80]
- test_cpp.py::test_model[gpt_session-80]
- test_cpp.py::test_model[gpt_tests-80]
- test_cpp.py::test_benchmarks[gpt-80]
- condition:
ranges:
system_gpu_count:
@@ -145,3 +143,18 @@
- examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-draft_len_4-float16-bs2] # 1 min
- examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2] # 1 min
- examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2] # 1 min
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*a30*'
linux_distribution_name: ubuntu*
terms:
stage: post_merge
backend: cpp
tests:
- test_cpp.py::test_model[gpt_session-80]
- test_cpp.py::test_benchmarks[gpt-80]