refactor: Add setBeamWidth method to DecoderState

Funatiq · Funatiq · commit 3830a1e699d7 · 2025-08-02T14:11:00.000Z
- Introduced setBeamWidth method in DecoderState to allow setting the beam width for specific requests in a batch.
- Updated CreateNewDecoderRequests to utilize the new setBeamWidth method, improving code clarity and maintainability.

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
diff --git a/cpp/include/tensorrt_llm/runtime/decoderState.h b/cpp/include/tensorrt_llm/runtime/decoderState.h
@@ -173,6 +173,11 @@ class DecoderState
     //! @brief Workspace for beam search in streaming mode.
     [[nodiscard]] BeamSearchBuffers const& getBeamSearchBuffers() const;
 
+    //! @brief Set the beam width for a specific request in the batch.
+    //! @param batchIdx The index of the request in the batch.
+    //! @param beamWidth The beam width for the specified request.
+    void setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth);
+
     //! @brief Cache indirection input for beam search.
     [[nodiscard]] TensorPtr getCacheIndirectionInput() const;
 
diff --git a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
@@ -581,8 +581,6 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon
     {
         llmReq->mSamplingConfig.normalizeLogProbs = mIsNormalizeLogProbs;
 
-        auto& dJointInput = decoderState.getJointDecodingInput();
-
         TLLM_CHECK(llmReq->mSeqSlot.has_value());
         auto const batchSlot = llmReq->mSeqSlot.value();
         auto const batchSize = decoderState.getMaxBatchSize();
@@ -595,7 +593,7 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon
         TLLM_CHECK_WITH_INFO(beamWidth <= maxBeamWidth,
             tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.",
                 beamWidth, maxBeamWidth));
-        dJointInput.beamWidths.at(batchSlot) = beamWidth;
+        decoderState.setBeamWidth(batchSlot, beamWidth);
 
         auto const promptLen = llmReq->getPromptLen();
 
@@ -626,6 +624,8 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon
             decoderRequest.generatedTokensPerEngineStep = modelConfig.getMaxDecodingTokens();
         }
 
+        auto& dJointInput = decoderState.getJointDecodingInput();
+
         auto const numDecodingEngineTokens = decoderRequest.generatedTokensPerEngineStep;
         initializeInputLengths(dJointInput, batchSlot, promptLen, llmReq->mMaxNewTokens, numDecodingEngineTokens,
             maxSequenceLength, decoderBufferManager);
diff --git a/cpp/tensorrt_llm/runtime/decoderState.cpp b/cpp/tensorrt_llm/runtime/decoderState.cpp
@@ -642,6 +642,11 @@ void DecoderState::setGenerationSteps(std::vector<SizeType32> const& generationS
     mJointDecodingInput->generationSteps = generationSteps;
 }
 
+void DecoderState::setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth)
+{
+    mJointDecodingInput->beamWidths.at(batchIdx) = beamWidth; // updates only the entry at batchIdx
+}
+
 DecodingInput& DecoderState::getJointDecodingInput() const
 {
     return *mJointDecodingInput;

Original file line number	Diff line number	Diff line change
`@@ -642,6 +642,11 @@ void DecoderState::setGenerationSteps(std::vector<SizeType32> const& generationS`
`642`	`642`	`mJointDecodingInput->generationSteps = generationSteps;`
`643`	`643`	`}`
`644`	`644`
	`645`	`+void DecoderState::setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth)`
	`646`	`+{`
	`647`	`+ mJointDecodingInput->beamWidths.at(batchIdx) = beamWidth;`
	`648`	`+}`
	`649`	`+`
`645`	`650`	`DecodingInput& DecoderState::getJointDecodingInput() const`
`646`	`651`	`{`
`647`	`652`	`return *mJointDecodingInput;`